From e1d34cb24fecb31eb40e7018f0b1cb37ec26f3a2 Mon Sep 17 00:00:00 2001 From: German Date: Mon, 11 Sep 2023 15:22:51 -0400 Subject: [PATCH] SWDEV-404889 - Debugger support, trap handler Add trap handler code into runtime and compile/load during device initialization. The current interface for trap handler in PAL is obsolete and the new interface will be provided later. Change-Id: I1fa702c5d1f2e6731f781369c980d546cf422328 --- rocclr/device/devprogram.cpp | 18 +-- rocclr/device/devprogram.hpp | 4 +- rocclr/device/pal/palblitcl.cpp | 225 ++++++++++++++++++++++++++++++- rocclr/device/pal/paldevice.cpp | 54 ++++++++ rocclr/device/pal/paldevice.hpp | 1 + rocclr/device/pal/palprogram.cpp | 62 +++++---- rocclr/device/pal/palprogram.hpp | 1 + 7 files changed, 329 insertions(+), 36 deletions(-) diff --git a/rocclr/device/devprogram.cpp b/rocclr/device/devprogram.cpp index 493f20b48d..f7e4aac484 100644 --- a/rocclr/device/devprogram.cpp +++ b/rocclr/device/devprogram.cpp @@ -514,8 +514,8 @@ bool Program::compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs, bool Program::compileAndLinkExecutable(const amd_comgr_data_set_t inputs, const std::vector& options, amd::option::Options* amdOptions, - char* executable[], size_t* executableSize) { - + char* executable[], size_t* executableSize, + file_type_t continueCompileFrom) { // create the linked output amd_comgr_action_info_t action; amd_comgr_data_set_t output; @@ -567,16 +567,18 @@ bool Program::compileAndLinkExecutable(const amd_comgr_data_set_t inputs, if (status == AMD_COMGR_STATUS_SUCCESS) { hasRelocatableData = true; - status = amd::Comgr::do_action(AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE, - action, inputs, relocatableData); + amd_comgr_action_kind_t kind = (continueCompileFrom == FILE_TYPE_ASM_TEXT) + ? AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE + : AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE; + status = amd::Comgr::do_action(kind, action, inputs, relocatableData); extractBuildLog(relocatableData); } // Create executable from the relocatable data set amd::Comgr::action_info_set_option_list(action, nullptr, 0); if (status == AMD_COMGR_STATUS_SUCCESS) { - status = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, - action, relocatableData, output); + status = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, action, + relocatableData, output); extractBuildLog(output); } @@ -688,7 +690,7 @@ bool Program::compileImplLC(const std::string& sourceCode, driverOptions.push_back("-mwavefrontsize64"); } driverOptions.push_back("-mcode-object-version=" + std::to_string(options->oVariables->LCCodeObjectVersion)); - + // Iterate through each source code and dump it into tmp std::fstream f; std::vector headerFileNames(headers.size()); @@ -1289,7 +1291,7 @@ bool Program::linkImplLC(amd::option::Options* options) { char* executable = nullptr; size_t executableSize = 0; bool ret = compileAndLinkExecutable(inputs, codegenOptions, options, &executable, - &executableSize); + &executableSize, continueCompileFrom); amd::Comgr::destroy_data_set(inputs); if (!ret) { diff --git a/rocclr/device/devprogram.hpp b/rocclr/device/devprogram.hpp index 78554f0c64..6b71fb8360 100644 --- a/rocclr/device/devprogram.hpp +++ b/rocclr/device/devprogram.hpp @@ -466,8 +466,8 @@ class Program : public amd::HeapObject { //! Compile and create the excutable of the input dataset bool compileAndLinkExecutable(const amd_comgr_data_set_t inputs, - const std::vector& options, amd::option::Options* amdOptions, - char* executable[], size_t* executableSize); + const std::vector& options, amd::option::Options* amdOptions, char* executable[], + size_t* executableSize, file_type_t continueCompileFrom); //! Create the map for the kernel name and its metadata for fast access bool createKernelMetadataMap(void* binary, size_t binSize); diff --git a/rocclr/device/pal/palblitcl.cpp b/rocclr/device/pal/palblitcl.cpp index a60d929152..1fe13aa75b 100644 --- a/rocclr/device/pal/palblitcl.cpp +++ b/rocclr/device/pal/palblitcl.cpp @@ -20,9 +20,9 @@ namespace pal { -#define BLIT_KERNEL(...) #__VA_ARGS__ +#define RUNTIME_KERNEL(...) #__VA_ARGS__ -const char* SchedulerSourceCode = BLIT_KERNEL( +const char* SchedulerSourceCode = RUNTIME_KERNEL( \n extern void __amd_scheduler(__global void*, __global void*, uint); \n @@ -31,7 +31,7 @@ __kernel void __amd_rocclr_scheduler(__global void* queue, __global void* params } \n); -const char* SchedulerSourceCode20 = BLIT_KERNEL( +const char* SchedulerSourceCode20 = RUNTIME_KERNEL( \n extern void __amd_scheduler_pal(__global void*, __global void*, uint); \n @@ -41,4 +41,223 @@ extern void __amd_scheduler_pal(__global void*, __global void*, uint); } \n); +const char* TrapHandlerCode = RUNTIME_KERNEL( +\n.set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF +\n.set SQ_WAVE_PC_HI_HT_SHIFT , 24 +\n.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 16 +\n.set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 8 +\n.set SQ_WAVE_PC_HI_TRAP_ID_BFE, (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) +\n.set SQ_WAVE_STATUS_HALT_SHIFT, 13 +\n.set SQ_WAVE_STATUS_HALT_BFE, (SQ_WAVE_STATUS_HALT_SHIFT | (1 << 16)) +\n.set SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 8 +\n.set SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT , 11 +\n.set SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT , 28 +\n.set SQ_WAVE_TRAPSTS_MATH_EXCP , 0x7F +\n.set SQ_WAVE_MODE_EXCP_EN_SHIFT , 12 +\n.set TRAP_ID_ABORT , 2 +\n.set TRAP_ID_DEBUGTRAP , 3 +\n.set DOORBELL_ID_SIZE , 10 +\n.set DOORBELL_ID_MASK , ((1 << DOORBELL_ID_SIZE) - 1) +\n.set EC_QUEUE_WAVE_ABORT_M0 , (1 << (DOORBELL_ID_SIZE + 0)) +\n.set EC_QUEUE_WAVE_TRAP_M0 , (1 << (DOORBELL_ID_SIZE + 1)) +\n.set EC_QUEUE_WAVE_MATH_ERROR_M0 , (1 << (DOORBELL_ID_SIZE + 2)) +\n.set EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 , (1 << (DOORBELL_ID_SIZE + 3)) +\n.set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 4)) +\n.set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 5)) +\n.set TTMP6_WAVE_STOPPED_SHIFT , 30 +\n.set TTMP6_SAVED_STATUS_HALT_SHIFT , 29 +\n.set TTMP6_SAVED_STATUS_HALT_MASK, (1 << TTMP6_SAVED_STATUS_HALT_SHIFT) +\n.set TTMP6_SAVED_TRAP_ID_SHIFT, 25 +\n.set TTMP6_SAVED_TRAP_ID_SIZE, 4 +\n.set TTMP6_SAVED_TRAP_ID_MASK, (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT) +\n.set TTMP6_SAVED_TRAP_ID_BFE, (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16)) +\n.set TTMP_PC_HI_SHIFT, 7 +\n.set TTMP_DEBUG_ENABLED_SHIFT, 23 +\n.if .amdgcn.gfx_generation_number == 9 +\n.set TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT, 26 +\n.set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, 15 +\n.set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK, 0x1F8000 +\n.elseif .amdgcn.gfx_generation_number == 10 &&.amdgcn.gfx_generation_minor < 3 +\n.set TTMP_SAVE_REPLAY_W64H_SHIFT, 31 +\n.set TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT, 24 +\n.set SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT, 25 +\n.set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, 15 +\n.set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK, 0x3F8000 +\n.set SQ_WAVE_IB_STS_REPLAY_W64H_MASK, 0x2000000 +\n.endif +\n.if .amdgcn.gfx_generation_number == 9 &&.amdgcn.gfx_generation_minor >= 4 +\n.set TTMP11_TTMPS_SETUP_SHIFT, 31 +\n.endif +// ABI between first and second level trap handler: +// ttmp0 = PC[31:0] +// ttmp12 = SQ_WAVE_STATUS +// ttmp14 = TMA[31:0] +// ttmp15 = TMA[63:32] +// gfx9: +// ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32] +// gfx906/gfx908/gfx90a: +// ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] +// gfx940/gfx941/gfx942: +// ttmp13 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[22:0] +// gfx10: +// ttmp1 = 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32] +// gfx1010: +// ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] +// gfx1030/gfx1100: +// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] +\n .globl trap_entry +\n .type trap_entry,@function +\ntrap_entry: + // Branch if not a trap (an exception instead). +\n s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE +\n s_cbranch_scc0 .no_skip_debugtrap + // If caused by s_trap then advance PC. +\n s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT +\n s_cbranch_scc1 .not_s_trap +\n s_add_u32 ttmp0, ttmp0, 0x4 +\n s_addc_u32 ttmp1, ttmp1, 0x0 +\n.not_s_trap: + // If llvm.debugtrap and debugger is not attached. +\n s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP +\n s_cbranch_scc0 .no_skip_debugtrap +\n.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) || .amdgcn.gfx_generation_number == 10 +\n s_bitcmp0_b32 ttmp11, TTMP_DEBUG_ENABLED_SHIFT +\n.else +\n s_bitcmp0_b32 ttmp13, TTMP_DEBUG_ENABLED_SHIFT +\n.endif +\n s_cbranch_scc0 .no_skip_debugtrap + // Ignore llvm.debugtrap. +\n s_branch .exit_trap +\n.no_skip_debugtrap: + // Save trap id and halt status in ttmp6. +\n s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) +\n s_min_u32 ttmp2, ttmp2, 0xF +\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT +\n s_or_b32 ttmp6, ttmp6, ttmp2 +\n s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATUS_HALT_BFE +\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT +\n s_or_b32 ttmp6, ttmp6, ttmp2 + // Fetch doorbell id for our queue. +\n.if .amdgcn.gfx_generation_number < 11 +\n s_mov_b32 ttmp2, exec_lo +\n s_mov_b32 ttmp3, exec_hi +\n s_mov_b32 exec_lo, 0x80000000 +\n s_sendmsg sendmsg(MSG_GET_DOORBELL) +\n.wait_sendmsg: +\n s_nop 0x7 +\n s_bitcmp0_b32 exec_lo, 0x1F +\n s_cbranch_scc0 .wait_sendmsg +\n s_mov_b32 exec_hi, ttmp3 + // Restore exec_lo, move the doorbell_id into ttmp3 +\n s_and_b32 ttmp3, exec_lo, DOORBELL_ID_MASK +\n s_mov_b32 exec_lo, ttmp2 +\n.else +\n s_sendmsg_rtn_b32 ttmp3, sendmsg(MSG_RTN_GET_DOORBELL) +\n s_waitcnt lgkmcnt(0) +\n s_and_b32 ttmp3, ttmp3, DOORBELL_ID_MASK +\n.endif + // Map trap reason to an exception code. +\n s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) +\n +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT +\n s_cbranch_scc0 .not_memory_violation +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 + // Aperture violation requires XNACK_ERROR == 0. +\n s_branch .not_aperture_violation +\n.not_memory_violation: +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT +\n s_cbranch_scc0 .not_aperture_violation +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 +\n.not_aperture_violation: +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT +\n s_cbranch_scc0 .not_illegal_instruction +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 +\n.not_illegal_instruction: +\n s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_MATH_EXCP +\n s_cbranch_scc0 .not_math_exception +\n s_getreg_b32 ttmp7, hwreg(HW_REG_MODE) +\n s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT +\n s_and_b32 ttmp2, ttmp2, ttmp7 +\n s_cbranch_scc0 .not_math_exception +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 +\n.not_math_exception: +\n s_bfe_u32 ttmp2, ttmp6, TTMP6_SAVED_TRAP_ID_BFE +\n s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT +\n s_cbranch_scc0 .not_abort_trap +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0 +\n.not_abort_trap: + // If no other exception was flagged then report a generic error. +\n s_andn2_b32 ttmp2, ttmp3, DOORBELL_ID_MASK +\n s_cbranch_scc1 .send_interrupt +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 +\n.send_interrupt: + // m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id +\n s_mov_b32 ttmp2, m0 +\n s_mov_b32 m0, ttmp3 +\n s_nop 0x0 // Manually inserted wait states +\n s_sendmsg sendmsg(MSG_INTERRUPT) +\n s_waitcnt lgkmcnt(0) // Wait for the message to go out. +\n s_mov_b32 m0, ttmp2 + // Parking the wave requires saving the original pc in the preserved ttmps. + // Register layout before parking the wave: + // + // ttmp7: 0[31:0] + // ttmp11: 1st_level_ttmp11[31:23] 0[15:0] 1st_level_ttmp11[6:0] + // + // After parking the wave: + // + // ttmp7: pc_lo[31:0] + // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] +\n.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) || (.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3) +\n // Save the PC +\n s_mov_b32 ttmp7, ttmp0 +\n s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK +\n s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT +\n s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) +\n s_or_b32 ttmp11, ttmp11, ttmp1 + // Park the wave +\n s_getpc_b64 [ttmp0, ttmp1] +\n s_add_u32 ttmp0, ttmp0, .parked - . +\n s_addc_u32 ttmp1, ttmp1, 0x0 +\n.endif +\n.halt_wave: + // Halt the wavefront upon restoring STATUS below. +\n s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT +\n s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_SHIFT +\n.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4) +\n s_bitcmp1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT +\n s_cbranch_scc1 .ttmps_initialized +\n s_mov_b32 ttmp4, 0 +\n s_mov_b32 ttmp5, 0 +\n s_bitset1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT +\n.ttmps_initialized: +\n.endif +\n.exit_trap: + // Restore SQ_WAVE_IB_STS. +\n.if .amdgcn.gfx_generation_number == 9 +\n.if .amdgcn.gfx_generation_minor < 4 +\n s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) +\n.else +\n s_lshr_b32 ttmp2, ttmp13, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) +\n.endif +\n s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK +\n s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 +\n.elseif .amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3 +\n s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) +\n s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK +\n s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) +\n s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK +\n s_or_b32 ttmp2, ttmp2, ttmp3 +\n s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 +\n.endif + // Restore SQ_WAVE_STATUS. +\n s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 +\n s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 +\n s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12 + // Return to original (possibly modified) PC. +\n s_rfe_b64 [ttmp0, ttmp1] +\n.parked: +\n s_trap 0x2 +\n s_branch .parked +\n); } // namespace pal diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp index 09f9fe4e28..2a49fd282d 100644 --- a/rocclr/device/pal/paldevice.cpp +++ b/rocclr/device/pal/paldevice.cpp @@ -816,6 +816,10 @@ Device::~Device() { // Destroy transfer queue delete xferQueue_; + if (trap_handler_ != nullptr) { + trap_handler_->release(); + } + // Destroy blit program delete blitProgram_; @@ -847,6 +851,7 @@ Device::~Device() { extern const char* SchedulerSourceCode; extern const char* SchedulerSourceCode20; +extern const char* TrapHandlerCode; Pal::IDevice* gDeviceList[Pal::MaxDevices] = {}; uint32_t gStartDevice = 0; @@ -1197,6 +1202,32 @@ bool Device::initializeHeapResources() { return false; } xferQueue_->enableSyncedBlit(); + // Setup trap handler if available + if (trap_handler_ != nullptr) { + auto program = reinterpret_cast( + trap_handler_->getDeviceProgram(*this)); + if (program != nullptr) { + Pal::Result result{Pal::Result::Success}; + Pal::GpuMemoryRef memRef = {}; + memRef.pGpuMemory = program->codeSegGpu().iMem(); + if (!settings().alwaysResident_) { + // Make sure trap handler is always resident in memory + // note: this code path is for OpenCL only, since HIP has alwaysResident_ enabled + result = iDev()->AddGpuMemoryReferences(1, &memRef, nullptr, Pal::GpuMemoryRefCantTrim); + } + if (result == Pal::Result::Success) { + // Find an offset in memory for the trap handler. + // Loader returns an absolute address, but PAL accepts base + offset, hense find offset + auto offset = program->GetTrapHandlerAddress() - memRef.pGpuMemory->Desc().gpuVirtAddr; +#ifdef PAL_DEBUGGER + // Bind trap handler to the kernel mode driver + iDev()->BindTrapHandler(Pal::PipelineBindPoint::Compute, memRef.pGpuMemory, offset); +#endif + } else { + LogError("Failed to make trap handler resident in memory"); + } + } + } } return true; } @@ -2552,6 +2583,29 @@ bool Device::createBlitProgram() { LogError("Couldn't create blit kernels!"); result = false; } + +#ifdef PAL_DEBUGGER + if (settings().useLightning_) { + const std::string TrapHandlerAsm = TrapHandlerCode; + // Create a program for trap handler + // note: It's not critical for runtime functionality to fail trap handler initialization + trap_handler_ = new amd::Program(*context_, TrapHandlerAsm.c_str(), amd::Program::Assembly); + if (trap_handler_ != nullptr) { + std::vector devices; + devices.push_back(this); + std::string opt = "-cl-internal-kernel "; + if (auto retval = + trap_handler_->build(devices, opt.c_str(), nullptr, nullptr, false) != CL_SUCCESS) { + DevLogPrintfError("Build failed for trap handler with error code: %d\n", retval); + } + if (!trap_handler_->load()) { + DevLogPrintfError("Could not load the trap handler \n"); + } + } else { + DevLogPrintfError("Trap handler creation failed\n"); + } + } +#endif return result; } diff --git a/rocclr/device/pal/paldevice.hpp b/rocclr/device/pal/paldevice.hpp index 943ffec322..764bea98ab 100644 --- a/rocclr/device/pal/paldevice.hpp +++ b/rocclr/device/pal/paldevice.hpp @@ -713,6 +713,7 @@ class Device : public NullDevice { Pal::GpuMemoryHeapProperties heaps_[Pal::GpuHeapCount]; //!< Information about heaps, returned from PAL std::map queue_pool_; //!< Pool of PAL queues for recycling + amd::Program* trap_handler_ = nullptr; //!< Trap handler program for debugger setup }; /*@}*/ // namespace pal diff --git a/rocclr/device/pal/palprogram.cpp b/rocclr/device/pal/palprogram.cpp index 70da1029b0..05eba432b7 100644 --- a/rocclr/device/pal/palprogram.cpp +++ b/rocclr/device/pal/palprogram.cpp @@ -278,7 +278,7 @@ bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWo size_t kernelNamesSize = 0; acl_error errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_, - RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize); + RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: Querying of kernel names size from the binary failed.\n"; return false; @@ -286,7 +286,7 @@ bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWo if (kernelNamesSize > 0) { std::vector kernelNames(kernelNamesSize); errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_, RT_KERNEL_NAMES, - nullptr, kernelNames.data(), &kernelNamesSize); + nullptr, kernelNames.data(), &kernelNamesSize); if (errorCode != ACL_SUCCESS) { buildLog_ += "Error: Querying of kernel names from the binary failed.\n"; return false; @@ -737,30 +737,34 @@ bool LightningProgram::createBinary(amd::option::Options* options) { return true; } +// ================================================================================================ bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize, bool internalKernel) { #if defined(USE_COMGR_LIBRARY) - // Find the size of global variables from the binary - if (!FindGlobalVarSize(binary, binSize)) { - buildLog_ += "Error: Cannot Find Global Var Sizes\n"; - return false; - } - - for (const auto& kernelMeta : kernelMetadataMap_) { - auto kernelName = kernelMeta.first; - auto kernel = new LightningKernel(kernelName, this, internalKernel); - if (kernel == nullptr) { + // Skip metadata look-up and kernel creation for assembly and internal kernel. + // @note: Runtime compiles only the second level trap handler from assembly + if ((owner()->language() != amd::Program::Assembly) || !internal_) { + // Find the size of global variables from the binary + if (!FindGlobalVarSize(binary, binSize)) { + buildLog_ += "Error: Cannot Find Global Var Sizes\n"; return false; } - if (!kernel->init()) { - buildLog_ += "[ROC][Kernel] Could not get Code Prop Meta Data \n"; - return false; + + for (const auto& kernelMeta : kernelMetadataMap_) { + auto kernelName = kernelMeta.first; + auto kernel = new LightningKernel(kernelName, this, internalKernel); + if (kernel == nullptr) { + return false; + } + if (!kernel->init()) { + buildLog_ += "[ROC][Kernel] Could not get Code Prop Meta Data \n"; + return false; + } + kernels()[kernelName] = kernel; + + kernel->setUniformWorkGroupSize(useUniformWorkGroupSize); } - kernels()[kernelName] = kernel; - - kernel->setUniformWorkGroupSize(useUniformWorkGroupSize); } - executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, nullptr); if (executable_ == nullptr) { LogError("Error: Executable for AMD HSA Code Object isn't created."); @@ -787,13 +791,14 @@ bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUnifo return true; } -bool LightningProgram::setKernels(void* binary, size_t binSize, - amd::Os::FileDesc fdesc, size_t foffset, std::string uri) { +// ================================================================================================ +bool LightningProgram::setKernels(void* binary, size_t binSize, amd::Os::FileDesc fdesc, + size_t foffset, std::string uri) { #if defined(USE_COMGR_LIBRARY) // Collect the information about compiled binary if (!isNull() && (palDevice().rgpCaptureMgr() != nullptr)) { - apiHash_ = palDevice().rgpCaptureMgr()->AddElfBinary(binary, binSize, binary, binSize, - codeSegGpu_->iMem(), codeSegGpu_->offset()); + apiHash_ = palDevice().rgpCaptureMgr()->AddElfBinary( + binary, binSize, binary, binSize, codeSegGpu_->iMem(), codeSegGpu_->offset()); } for (auto& kit : kernels()) { @@ -812,4 +817,15 @@ bool LightningProgram::setKernels(void* binary, size_t binSize, return true; } +// ================================================================================================ +uint64_t LightningProgram::GetTrapHandlerAddress() const { + uint64_t address = 0; + hsa_agent_t agent = {amd::Device::toHandle(&(device()))}; + auto trap_sym = executable_->GetSymbol("trap_entry", &agent); + if (trap_sym != nullptr) { + trap_sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &address); + } + return address; +} + } // namespace pal diff --git a/rocclr/device/pal/palprogram.hpp b/rocclr/device/pal/palprogram.hpp index afbe857052..b70b4d56c2 100644 --- a/rocclr/device/pal/palprogram.hpp +++ b/rocclr/device/pal/palprogram.hpp @@ -269,6 +269,7 @@ class LightningProgram : public HSAILProgram { isHIP_ = (owner.language() == amd::Program::HIP); } virtual ~LightningProgram() {} + uint64_t GetTrapHandlerAddress() const; protected: virtual bool createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,