From 7d661bc7dfcd35ecc84cad084d3584743b0cefeb Mon Sep 17 00:00:00 2001 From: German Date: Wed, 22 Nov 2023 14:25:25 -0500 Subject: [PATCH] SWDEV-404889 - Enable debugger interface in PAL Add GPU_DEBUG_ENABLE to control ttmp behavior. If enabled, then HW will collect more debug info at some perf cost Change-Id: Icee0686b903a7b1bd483710b9d611877cd43c6aa --- rocclr/device/pal/paldevice.cpp | 38 ++++++++++++++++++-------------- rocclr/device/pal/paldevice.hpp | 5 +++++ rocclr/device/pal/palvirtual.cpp | 6 ----- rocclr/device/pal/palvirtual.hpp | 3 --- rocclr/utils/flags.hpp | 2 ++ 5 files changed, 28 insertions(+), 26 deletions(-) diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp index 950e29f50b..5f29fa4460 100644 --- a/rocclr/device/pal/paldevice.cpp +++ b/rocclr/device/pal/paldevice.cpp @@ -1141,15 +1141,15 @@ bool Device::initializeHeapResources() { if (iDev()->Finalize(finalizeInfo) != Pal::Result::Success) { return false; } -#ifdef PAL_DEBUGGER - Pal::RuntimeSetup setup; - setup.r_debug = reinterpret_cast(_amdgpu_r_debug_ptr); - if (iDev()->RegisterRuntimeState(&setup) != Pal::Result::Success) { + Pal::HipRuntimeSetup setup {.pRdebug = _amdgpu_r_debug_ptr, + .runtimeState = 1, // Always valid debug state + .ttmpSetupHint = GPU_DEBUG_ENABLE}; + setup.pRdebug = _amdgpu_r_debug_ptr; + if (iDev()->RegisterHipRuntimeState(setup) != Pal::Result::Success) { LogError("Couldn't register debug state from the loader!"); // Note: ignore debug state error, since it's not a critical // error for the execution } -#endif heapInitComplete_ = true; @@ -1211,10 +1211,11 @@ bool Device::initializeHeapResources() { // Find an offset in memory for the trap handler. 
// Loader returns an absolute address, but PAL accepts base + offset, hense find offset auto offset = program->GetTrapHandlerAddress() - memRef.pGpuMemory->Desc().gpuVirtAddr; -#ifdef PAL_DEBUGGER - // Bind trap handler to the kernel mode driver - iDev()->BindTrapHandler(Pal::PipelineBindPoint::Compute, memRef.pGpuMemory, offset); -#endif + // Bind the trap handler's executable to the kernel mode driver + result = iDev()->SetHipTrapHandler(memRef.pGpuMemory, offset, nullptr, 0); + if (result != Pal::Result::Success) { + LogError("KMD failed to setup the trap handler"); + } } else { LogError("Failed to make trap handler resident in memory"); } @@ -2607,28 +2608,31 @@ bool Device::createBlitProgram() { result = false; } -#ifdef PAL_DEBUGGER if (settings().useLightning_) { const std::string TrapHandlerAsm = TrapHandlerCode; // Create a program for trap handler // note: It's not critical for runtime functionality to fail trap handler initialization - trap_handler_ = new amd::Program(*context_, TrapHandlerAsm.c_str(), amd::Program::Assembly); - if (trap_handler_ != nullptr) { + auto asm_program = new amd::Program(*context_, TrapHandlerAsm.c_str(), amd::Program::Assembly); + if (asm_program != nullptr) { std::vector devices; devices.push_back(this); std::string opt = "-cl-internal-kernel "; if (auto retval = - trap_handler_->build(devices, opt.c_str(), nullptr, nullptr, false) != CL_SUCCESS) { + asm_program->build(devices, opt.c_str(), nullptr, nullptr, false) != CL_SUCCESS) { DevLogPrintfError("Build failed for trap handler with error code: %d\n", retval); - } - if (!trap_handler_->load()) { - DevLogPrintfError("Could not load the trap handler \n"); + asm_program->release(); + } else { + if (asm_program->load()) { + trap_handler_ = asm_program; + } else { + DevLogPrintfError("Could not load the trap handler \n"); + asm_program->release(); + } } } else { DevLogPrintfError("Trap handler creation failed\n"); } } -#endif return result; } diff --git 
a/rocclr/device/pal/paldevice.hpp b/rocclr/device/pal/paldevice.hpp index 0461751cef..a843a02174 100644 --- a/rocclr/device/pal/paldevice.hpp +++ b/rocclr/device/pal/paldevice.hpp @@ -247,6 +247,11 @@ class Device : public NullDevice { AqlPacketMgmt aql_packet_mgmt_; //!< AQL packets management class for debugger support QueueRecycleInfo() : counter_(1), engineType_(Pal::EngineTypeCompute), index_(0), queue_lock_("Queue lock for sharing", true) {} + + //! Returns the aql packet list + uintptr_t AqlPacketList() const { + return reinterpret_cast<uintptr_t>(&aql_packet_mgmt_.aql_packets_); + } }; //! Locks any access to the virtual GPUs diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp index 56b734bced..f3131e611c 100644 --- a/rocclr/device/pal/palvirtual.cpp +++ b/rocclr/device/pal/palvirtual.cpp @@ -156,9 +156,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que return nullptr; } addrQ = reinterpret_cast
(&info[1]); -#ifdef PAL_DEBUGGER qCreateInfo.aqlPacketList = info->AqlPacketList(); -#endif result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_); if (result == Pal::Result::Success) { const_cast(gpu.dev()).QueuePool().insert({queue->iQueue_, info}); @@ -203,9 +201,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que queue->aql_mgmt_ = &info->aql_packet_mgmt_; // Exclusive compute path addrQ = reinterpret_cast
(&queue[1]); -#ifdef PAL_DEBUGGER qCreateInfo.aqlPacketList = info->AqlPacketList(); -#endif result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_); } if (result != Pal::Result::Success) { @@ -2668,9 +2664,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, dispatchParam.wavesPerSh = 0; dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false; dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize(); -#ifdef PAL_DEBUGGER dispatchParam.aqlPacketIndex = aql_index; -#endif // Run AQL dispatch in HW eventBegin(MainEngine); iCmd()->CmdDispatchAql(dispatchParam); diff --git a/rocclr/device/pal/palvirtual.hpp b/rocclr/device/pal/palvirtual.hpp index c43b7be6be..8086e5eb2c 100644 --- a/rocclr/device/pal/palvirtual.hpp +++ b/rocclr/device/pal/palvirtual.hpp @@ -60,9 +60,6 @@ struct AqlPacketMgmt : public amd::EmbeddedObject { memset(aql_vgpus_, 0, sizeof(aql_vgpus_)); } - //! Returns the aql packet list - uintptr_t AqlPacketList() const { return reinterpret_cast<uintptr_t>(&aql_packets_); } - hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize]; //!< The list of AQL packets GpuEvent aql_events_[kAqlPacketsListSize]; //!< The list of gpu for each AQL packet VirtualGPU* aql_vgpus_[kAqlPacketsListSize]; //!< The list of vgpus which had submissions diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp index bd8c32e25e..51ebdd0d39 100644 --- a/rocclr/utils/flags.hpp +++ b/rocclr/utils/flags.hpp @@ -237,6 +237,8 @@ release(bool, HIP_FORCE_DEV_KERNARG, 0, \ "Force device mem for kernel args.") \ release(bool, DEBUG_CLR_GRAPH_PACKET_CAPTURE, false, \ "Enable/Disable graph packet capturing") \ +release(bool, GPU_DEBUG_ENABLE, false, \ + "Enables collection of extra info for debugger at some perf cost") \ release(cstring, HIPRTC_COMPILE_OPTIONS_APPEND, "", \ "Set compile options needed for hiprtc compilation") \ release(cstring, HIPRTC_LINK_OPTIONS_APPEND, "", \