diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp index 5c77946d90..b03f936d8d 100644 --- a/rocclr/device/pal/paldevice.cpp +++ b/rocclr/device/pal/paldevice.cpp @@ -1125,6 +1125,14 @@ bool Device::initializeHeapResources() { } } + // Update RGP capture manager + if (rgpCaptureMgr_ != nullptr) { + if (!rgpCaptureMgr_->Update(platform_)) { + delete rgpCaptureMgr_; + rgpCaptureMgr_ = nullptr; + } + } + // Create a synchronized transfer queue xferQueue_ = new VirtualGPU(*this); if (!(xferQueue_ && xferQueue_->create(false))) { @@ -1136,14 +1144,6 @@ bool Device::initializeHeapResources() { return false; } xferQueue_->enableSyncedBlit(); - - // Update RGP capture manager - if (rgpCaptureMgr_ != nullptr) { - if (!rgpCaptureMgr_->Update(platform_)) { - delete rgpCaptureMgr_; - rgpCaptureMgr_ = nullptr; - } - } } return true; } diff --git a/rocclr/device/pal/palgpuopen.cpp b/rocclr/device/pal/palgpuopen.cpp index a03418a843..8437bc0600 100644 --- a/rocclr/device/pal/palgpuopen.cpp +++ b/rocclr/device/pal/palgpuopen.cpp @@ -76,6 +76,28 @@ RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& dev return mgr; } +// ================================================================================================ +uint64_t RgpCaptureMgr::AddElfBinary(const void* exe_binary, size_t exe_binary_size, + const void* elf_binary, size_t elf_binary_size, + Pal::IGpuMemory* pGpuMemory, size_t offset) { + GpuUtil::ElfBinaryInfo elfBinaryInfo = {}; + elfBinaryInfo.pBinary = exe_binary; + elfBinaryInfo.binarySize = exe_binary_size; ///< FAT Elf binary size. + elfBinaryInfo.pGpuMemory = pGpuMemory; ///< GPU Memory where the compiled ISA resides. 
+ elfBinaryInfo.offset = static_cast(offset); + + elfBinaryInfo.originalHash = DevDriver::MetroHash::MetroHash64( + reinterpret_cast(elf_binary), elf_binary_size); + + elfBinaryInfo.compiledHash = DevDriver::MetroHash::MetroHash64( + reinterpret_cast(exe_binary), exe_binary_size); + + assert(trace_.gpa_session_ != nullptr); + + trace_.gpa_session_->RegisterElfBinary(elfBinaryInfo); + return elfBinaryInfo.originalHash; +} + // ================================================================================================ bool RgpCaptureMgr::Init(Pal::IPlatform* platform) { if (dev_driver_server_ == nullptr) { @@ -413,6 +435,9 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size } } } + // Write the hash value + WriteComputeBindMarker(gpu, kernel.prog().ApiHash()); + WriteUserEventMarker(gpu, RgpSqttMarkerUserEventObjectName, kernel.name()); // Write disaptch marker WriteEventWithDimsMarker(gpu, apiEvent, static_cast(x), static_cast(y), @@ -893,6 +918,19 @@ void RgpCaptureMgr::WriteUserEventMarker(const VirtualGPU* gpu, WriteMarker(gpu, user_event_, markerSize); } +// ================================================================================================ +// Inserts a compute bind marker +void RgpCaptureMgr::WriteComputeBindMarker(const VirtualGPU* gpu, uint64_t api_hash) const { + RgpSqttMarkerPipelineBind marker = {}; + + marker.identifier = RgpSqttMarkerIdentifierBindPipeline; + marker.cbID = gpu->queue(MainEngine).cmdBufId(); + marker.bindPoint = 1; + + memcpy(marker.apiPsoHash, &api_hash, sizeof(api_hash)); + WriteMarker(gpu, &marker, sizeof(marker)); +} + } // namespace pal #endif // PAL_GPUOPEN_OCL diff --git a/rocclr/device/pal/palgpuopen.hpp b/rocclr/device/pal/palgpuopen.hpp index 9705f67478..6396c44082 100644 --- a/rocclr/device/pal/palgpuopen.hpp +++ b/rocclr/device/pal/palgpuopen.hpp @@ -68,6 +68,7 @@ class ICmdBuffer; class IFence; class IQueueSemaphore; struct PalPublicSettings; +class IGpuMemory; } // 
namespace Pal // GPUOpen forward declarations @@ -91,22 +92,22 @@ namespace pal { // ================================================================================================ // RgpSqttMarkerIdentifier - Identifiers for RGP SQ thread-tracing markers (Table 1) enum RgpSqttMarkerIdentifier : uint32_t { - RgpSqttMarkerIdentifierEvent = 0x0, - RgpSqttMarkerIdentifierCbStart = 0x1, - RgpSqttMarkerIdentifierCbEnd = 0x2, - RgpSqttMarkerIdentifierBarrierStart = 0x3, - RgpSqttMarkerIdentifierBarrierEnd = 0x4, - RgpSqttMarkerIdentifierUserEvent = 0x5, - RgpSqttMarkerIdentifierGeneralApi = 0x6, - RgpSqttMarkerIdentifierSync = 0x7, - RgpSqttMarkerIdentifierPresent = 0x8, - RgpSqttMarkerIdentifierLayoutTransition = 0x9, - RgpSqttMarkerIdentifierRenderPass = 0xA, - RgpSqttMarkerIdentifierReserved2 = 0xB, - RgpSqttMarkerIdentifierReserved3 = 0xC, - RgpSqttMarkerIdentifierReserved4 = 0xD, - RgpSqttMarkerIdentifierReserved5 = 0xE, - RgpSqttMarkerIdentifierReserved6 = 0xF + RgpSqttMarkerIdentifierEvent = 0x0, + RgpSqttMarkerIdentifierCbStart = 0x1, + RgpSqttMarkerIdentifierCbEnd = 0x2, + RgpSqttMarkerIdentifierBarrierStart = 0x3, + RgpSqttMarkerIdentifierBarrierEnd = 0x4, + RgpSqttMarkerIdentifierUserEvent = 0x5, + RgpSqttMarkerIdentifierGeneralApi = 0x6, + RgpSqttMarkerIdentifierSync = 0x7, + RgpSqttMarkerIdentifierPresent = 0x8, + RgpSqttMarkerIdentifierLayoutTransition = 0x9, + RgpSqttMarkerIdentifierRenderPass = 0xA, + RgpSqttMarkerIdentifierReserved2 = 0xB, + RgpSqttMarkerIdentifierBindPipeline = 0xC, + RgpSqttMarkerIdentifierReserved4 = 0xD, + RgpSqttMarkerIdentifierReserved5 = 0xE, + RgpSqttMarkerIdentifierReserved6 = 0xF }; // ================================================================================================ @@ -238,6 +239,33 @@ struct RgpSqttMarkerBarrierEnd { }; }; +// ================================================================================================ +// RgpSqttMarkerPipelineBind - RGP SQ thread-tracing marker written whenever a pipeline 
is bound (Table 12). +struct RgpSqttMarkerPipelineBind { + union { + struct { + uint32_t identifier : 4; // Identifier for this marker + uint32_t extDwords : 3; // Number of extra dwords following this marker + uint32_t bindPoint : 1; // The bind point of the pipeline within a queue + // 0 = graphics bind point + // 1 = compute bind point + uint32_t cbID : 20; // A command buffer ID encoded as per Table 13. + uint32_t reserved : 4; // Reserved + }; + + uint32_t dword01; // The first dword + }; + + union { + uint32_t apiPsoHash[2]; // The API PSO hash of the pipeline being bound + struct { + uint32_t dword02; // The second dword + uint32_t dword03; // The third dword + }; + }; +}; + + // RGP SQTT Instrumentation Specification version (API-independent) constexpr uint32_t RgpSqttInstrumentationSpecVersion = 1; @@ -312,7 +340,8 @@ class RgpCaptureMgr { Pal::Result TimedQueueSubmit(Pal::IQueue* queue, uint64_t cmdId, const Pal::SubmitInfo& submitInfo) const; bool Update(Pal::IPlatform* platform); - + uint64_t AddElfBinary(const void* exe_binary, size_t exe_binary_size, const void* elf_binary, + size_t elf_binary_size, Pal::IGpuMemory* pGpuMemory, size_t offset); private: // Steps that an RGP trace goes through enum class TraceStatus { @@ -360,6 +389,7 @@ class RgpCaptureMgr { uint32_t y, uint32_t z) const; void WriteUserEventMarker(const VirtualGPU* gpu, RgpSqttMarkerUserEventType eventType, const std::string& name) const; + void WriteComputeBindMarker(const VirtualGPU* gpu, uint64_t api_hash) const; const Device& device_; DevDriver::DevDriverServer* dev_driver_server_; @@ -411,8 +441,14 @@ class RgpCaptureMgr { void PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size_t x, size_t y, size_t z) {} void PostDispatch(VirtualGPU* gpu) {} void FinishRGPTrace(VirtualGPU* gpu, bool aborted) {} - bool RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue, bool* debug_vmid) const { return true; } + bool RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue, bool* 
debug_vmid) const { + return true; + } bool Update(Pal::IPlatform* platform) const { return true; } + uint64_t AddElfBinary(const void* exe_binary, size_t exe_binary_size, const void* elf_binary, + size_t elf_binary_size, Pal::IGpuMemory* pGpuMemory, size_t offset) { + return 0; + } }; } // namespace pal #endif // PAL_GPUOPEN_OCL diff --git a/rocclr/device/pal/palprogram.cpp b/rocclr/device/pal/palprogram.cpp index 5d7a5da542..c2d2b8f5d5 100644 --- a/rocclr/device/pal/palprogram.cpp +++ b/rocclr/device/pal/palprogram.cpp @@ -802,6 +802,12 @@ bool LightningProgram::setKernels(void* binary, size_t binSize, return true; } + // Collect the information about compiled binary + if (palDevice().rgpCaptureMgr() != nullptr) { + apiHash_ = palDevice().rgpCaptureMgr()->AddElfBinary(binary, binSize, binary, binSize, + codeSegGpu_->iMem(), codeSegGpu_->offset()); + } + for (auto& kit : kernels()) { LightningKernel* kernel = static_cast<LightningKernel*>(kit.second); if (!kernel->postLoad()) { diff --git a/rocclr/device/pal/palprogram.hpp b/rocclr/device/pal/palprogram.hpp index 69a8ce6fb6..e5b4942cb6 100644 --- a/rocclr/device/pal/palprogram.hpp +++ b/rocclr/device/pal/palprogram.hpp @@ -200,6 +200,9 @@ class HSAILProgram : public device::Program { return executable_->GetSymbol(symbol_name, agent); } + //! Returns API hash value of the program for RGP thread trace + uint64_t ApiHash() const { return apiHash_; } + protected: bool saveBinaryAndSetType(type_t type); @@ -246,6 +249,7 @@ class HSAILProgram : public device::Program { //!< in the program by individual kernel uint maxVgprs_; //!< Maximum number of VGPR(s) used //!< in the program by individual kernel + uint64_t apiHash_ = 0; //!< API hash value for RGP thread trace std::list staticSamplers_; //!< List od internal static samplers