SWDEV-404889 - Debugger support, trap handler
Add trap handler code into runtime and compile/load during
device initialization. The current interface for trap handler in
PAL is obsolete and the new interface will be provided later.
Change-Id: I1fa702c5d1f2e6731f781369c980d546cf422328
[ROCm/clr commit: e1d34cb24f]
Этот коммит содержится в:
коммит произвёл
German Andryeyev
родитель
7d69dd7598
Коммит
c3d48d80fa
@@ -514,8 +514,8 @@ bool Program::compileToLLVMBitcode(const amd_comgr_data_set_t compileInputs,
|
||||
bool Program::compileAndLinkExecutable(const amd_comgr_data_set_t inputs,
|
||||
const std::vector<std::string>& options,
|
||||
amd::option::Options* amdOptions,
|
||||
char* executable[], size_t* executableSize) {
|
||||
|
||||
char* executable[], size_t* executableSize,
|
||||
file_type_t continueCompileFrom) {
|
||||
// create the linked output
|
||||
amd_comgr_action_info_t action;
|
||||
amd_comgr_data_set_t output;
|
||||
@@ -567,16 +567,18 @@ bool Program::compileAndLinkExecutable(const amd_comgr_data_set_t inputs,
|
||||
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
hasRelocatableData = true;
|
||||
status = amd::Comgr::do_action(AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE,
|
||||
action, inputs, relocatableData);
|
||||
amd_comgr_action_kind_t kind = (continueCompileFrom == FILE_TYPE_ASM_TEXT)
|
||||
? AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE
|
||||
: AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE;
|
||||
status = amd::Comgr::do_action(kind, action, inputs, relocatableData);
|
||||
extractBuildLog(relocatableData);
|
||||
}
|
||||
|
||||
// Create executable from the relocatable data set
|
||||
amd::Comgr::action_info_set_option_list(action, nullptr, 0);
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
status = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE,
|
||||
action, relocatableData, output);
|
||||
status = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, action,
|
||||
relocatableData, output);
|
||||
extractBuildLog(output);
|
||||
}
|
||||
|
||||
@@ -688,7 +690,7 @@ bool Program::compileImplLC(const std::string& sourceCode,
|
||||
driverOptions.push_back("-mwavefrontsize64");
|
||||
}
|
||||
driverOptions.push_back("-mcode-object-version=" + std::to_string(options->oVariables->LCCodeObjectVersion));
|
||||
|
||||
|
||||
// Iterate through each source code and dump it into tmp
|
||||
std::fstream f;
|
||||
std::vector<std::string> headerFileNames(headers.size());
|
||||
@@ -1289,7 +1291,7 @@ bool Program::linkImplLC(amd::option::Options* options) {
|
||||
char* executable = nullptr;
|
||||
size_t executableSize = 0;
|
||||
bool ret = compileAndLinkExecutable(inputs, codegenOptions, options, &executable,
|
||||
&executableSize);
|
||||
&executableSize, continueCompileFrom);
|
||||
amd::Comgr::destroy_data_set(inputs);
|
||||
|
||||
if (!ret) {
|
||||
|
||||
@@ -466,8 +466,8 @@ class Program : public amd::HeapObject {
|
||||
|
||||
//! Compile and create the excutable of the input dataset
|
||||
bool compileAndLinkExecutable(const amd_comgr_data_set_t inputs,
|
||||
const std::vector<std::string>& options, amd::option::Options* amdOptions,
|
||||
char* executable[], size_t* executableSize);
|
||||
const std::vector<std::string>& options, amd::option::Options* amdOptions, char* executable[],
|
||||
size_t* executableSize, file_type_t continueCompileFrom);
|
||||
|
||||
//! Create the map for the kernel name and its metadata for fast access
|
||||
bool createKernelMetadataMap(void* binary, size_t binSize);
|
||||
|
||||
@@ -20,9 +20,9 @@
|
||||
|
||||
namespace pal {
|
||||
|
||||
#define BLIT_KERNEL(...) #__VA_ARGS__
|
||||
#define RUNTIME_KERNEL(...) #__VA_ARGS__
|
||||
|
||||
const char* SchedulerSourceCode = BLIT_KERNEL(
|
||||
const char* SchedulerSourceCode = RUNTIME_KERNEL(
|
||||
\n
|
||||
extern void __amd_scheduler(__global void*, __global void*, uint);
|
||||
\n
|
||||
@@ -31,7 +31,7 @@ __kernel void __amd_rocclr_scheduler(__global void* queue, __global void* params
|
||||
}
|
||||
\n);
|
||||
|
||||
const char* SchedulerSourceCode20 = BLIT_KERNEL(
|
||||
const char* SchedulerSourceCode20 = RUNTIME_KERNEL(
|
||||
\n
|
||||
extern void __amd_scheduler_pal(__global void*, __global void*, uint);
|
||||
\n
|
||||
@@ -41,4 +41,223 @@ extern void __amd_scheduler_pal(__global void*, __global void*, uint);
|
||||
}
|
||||
\n);
|
||||
|
||||
const char* TrapHandlerCode = RUNTIME_KERNEL(
|
||||
\n.set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF
|
||||
\n.set SQ_WAVE_PC_HI_HT_SHIFT , 24
|
||||
\n.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 16
|
||||
\n.set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 8
|
||||
\n.set SQ_WAVE_PC_HI_TRAP_ID_BFE, (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16))
|
||||
\n.set SQ_WAVE_STATUS_HALT_SHIFT, 13
|
||||
\n.set SQ_WAVE_STATUS_HALT_BFE, (SQ_WAVE_STATUS_HALT_SHIFT | (1 << 16))
|
||||
\n.set SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 8
|
||||
\n.set SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT , 11
|
||||
\n.set SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT , 28
|
||||
\n.set SQ_WAVE_TRAPSTS_MATH_EXCP , 0x7F
|
||||
\n.set SQ_WAVE_MODE_EXCP_EN_SHIFT , 12
|
||||
\n.set TRAP_ID_ABORT , 2
|
||||
\n.set TRAP_ID_DEBUGTRAP , 3
|
||||
\n.set DOORBELL_ID_SIZE , 10
|
||||
\n.set DOORBELL_ID_MASK , ((1 << DOORBELL_ID_SIZE) - 1)
|
||||
\n.set EC_QUEUE_WAVE_ABORT_M0 , (1 << (DOORBELL_ID_SIZE + 0))
|
||||
\n.set EC_QUEUE_WAVE_TRAP_M0 , (1 << (DOORBELL_ID_SIZE + 1))
|
||||
\n.set EC_QUEUE_WAVE_MATH_ERROR_M0 , (1 << (DOORBELL_ID_SIZE + 2))
|
||||
\n.set EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 , (1 << (DOORBELL_ID_SIZE + 3))
|
||||
\n.set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 4))
|
||||
\n.set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 5))
|
||||
\n.set TTMP6_WAVE_STOPPED_SHIFT , 30
|
||||
\n.set TTMP6_SAVED_STATUS_HALT_SHIFT , 29
|
||||
\n.set TTMP6_SAVED_STATUS_HALT_MASK, (1 << TTMP6_SAVED_STATUS_HALT_SHIFT)
|
||||
\n.set TTMP6_SAVED_TRAP_ID_SHIFT, 25
|
||||
\n.set TTMP6_SAVED_TRAP_ID_SIZE, 4
|
||||
\n.set TTMP6_SAVED_TRAP_ID_MASK, (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT)
|
||||
\n.set TTMP6_SAVED_TRAP_ID_BFE, (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16))
|
||||
\n.set TTMP_PC_HI_SHIFT, 7
|
||||
\n.set TTMP_DEBUG_ENABLED_SHIFT, 23
|
||||
\n.if .amdgcn.gfx_generation_number == 9
|
||||
\n.set TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT, 26
|
||||
\n.set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, 15
|
||||
\n.set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK, 0x1F8000
|
||||
\n.elseif .amdgcn.gfx_generation_number == 10 &&.amdgcn.gfx_generation_minor < 3
|
||||
\n.set TTMP_SAVE_REPLAY_W64H_SHIFT, 31
|
||||
\n.set TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT, 24
|
||||
\n.set SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT, 25
|
||||
\n.set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, 15
|
||||
\n.set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK, 0x3F8000
|
||||
\n.set SQ_WAVE_IB_STS_REPLAY_W64H_MASK, 0x2000000
|
||||
\n.endif
|
||||
\n.if .amdgcn.gfx_generation_number == 9 &&.amdgcn.gfx_generation_minor >= 4
|
||||
\n.set TTMP11_TTMPS_SETUP_SHIFT, 31
|
||||
\n.endif
|
||||
// ABI between first and second level trap handler:
|
||||
// ttmp0 = PC[31:0]
|
||||
// ttmp12 = SQ_WAVE_STATUS
|
||||
// ttmp14 = TMA[31:0]
|
||||
// ttmp15 = TMA[63:32]
|
||||
// gfx9:
|
||||
// ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32]
|
||||
// gfx906/gfx908/gfx90a:
|
||||
// ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0]
|
||||
// gfx940/gfx941/gfx942:
|
||||
// ttmp13 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[22:0]
|
||||
// gfx10:
|
||||
// ttmp1 = 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32]
|
||||
// gfx1010:
|
||||
// ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0]
|
||||
// gfx1030/gfx1100:
|
||||
// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0]
|
||||
\n .globl trap_entry
|
||||
\n .type trap_entry,@function
|
||||
\ntrap_entry:
|
||||
// Branch if not a trap (an exception instead).
|
||||
\n s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
|
||||
\n s_cbranch_scc0 .no_skip_debugtrap
|
||||
// If caused by s_trap then advance PC.
|
||||
\n s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT
|
||||
\n s_cbranch_scc1 .not_s_trap
|
||||
\n s_add_u32 ttmp0, ttmp0, 0x4
|
||||
\n s_addc_u32 ttmp1, ttmp1, 0x0
|
||||
\n.not_s_trap:
|
||||
// If llvm.debugtrap and debugger is not attached.
|
||||
\n s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP
|
||||
\n s_cbranch_scc0 .no_skip_debugtrap
|
||||
\n.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) || .amdgcn.gfx_generation_number == 10
|
||||
\n s_bitcmp0_b32 ttmp11, TTMP_DEBUG_ENABLED_SHIFT
|
||||
\n.else
|
||||
\n s_bitcmp0_b32 ttmp13, TTMP_DEBUG_ENABLED_SHIFT
|
||||
\n.endif
|
||||
\n s_cbranch_scc0 .no_skip_debugtrap
|
||||
// Ignore llvm.debugtrap.
|
||||
\n s_branch .exit_trap
|
||||
\n.no_skip_debugtrap:
|
||||
// Save trap id and halt status in ttmp6.
|
||||
\n s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK)
|
||||
\n s_min_u32 ttmp2, ttmp2, 0xF
|
||||
\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT
|
||||
\n s_or_b32 ttmp6, ttmp6, ttmp2
|
||||
\n s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATUS_HALT_BFE
|
||||
\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT
|
||||
\n s_or_b32 ttmp6, ttmp6, ttmp2
|
||||
// Fetch doorbell id for our queue.
|
||||
\n.if .amdgcn.gfx_generation_number < 11
|
||||
\n s_mov_b32 ttmp2, exec_lo
|
||||
\n s_mov_b32 ttmp3, exec_hi
|
||||
\n s_mov_b32 exec_lo, 0x80000000
|
||||
\n s_sendmsg sendmsg(MSG_GET_DOORBELL)
|
||||
\n.wait_sendmsg:
|
||||
\n s_nop 0x7
|
||||
\n s_bitcmp0_b32 exec_lo, 0x1F
|
||||
\n s_cbranch_scc0 .wait_sendmsg
|
||||
\n s_mov_b32 exec_hi, ttmp3
|
||||
// Restore exec_lo, move the doorbell_id into ttmp3
|
||||
\n s_and_b32 ttmp3, exec_lo, DOORBELL_ID_MASK
|
||||
\n s_mov_b32 exec_lo, ttmp2
|
||||
\n.else
|
||||
\n s_sendmsg_rtn_b32 ttmp3, sendmsg(MSG_RTN_GET_DOORBELL)
|
||||
\n s_waitcnt lgkmcnt(0)
|
||||
\n s_and_b32 ttmp3, ttmp3, DOORBELL_ID_MASK
|
||||
\n.endif
|
||||
// Map trap reason to an exception code.
|
||||
\n s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS)
|
||||
\n
|
||||
\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT
|
||||
\n s_cbranch_scc0 .not_memory_violation
|
||||
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0
|
||||
// Aperture violation requires XNACK_ERROR == 0.
|
||||
\n s_branch .not_aperture_violation
|
||||
\n.not_memory_violation:
|
||||
\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT
|
||||
\n s_cbranch_scc0 .not_aperture_violation
|
||||
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0
|
||||
\n.not_aperture_violation:
|
||||
\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT
|
||||
\n s_cbranch_scc0 .not_illegal_instruction
|
||||
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0
|
||||
\n.not_illegal_instruction:
|
||||
\n s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_MATH_EXCP
|
||||
\n s_cbranch_scc0 .not_math_exception
|
||||
\n s_getreg_b32 ttmp7, hwreg(HW_REG_MODE)
|
||||
\n s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT
|
||||
\n s_and_b32 ttmp2, ttmp2, ttmp7
|
||||
\n s_cbranch_scc0 .not_math_exception
|
||||
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0
|
||||
\n.not_math_exception:
|
||||
\n s_bfe_u32 ttmp2, ttmp6, TTMP6_SAVED_TRAP_ID_BFE
|
||||
\n s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT
|
||||
\n s_cbranch_scc0 .not_abort_trap
|
||||
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0
|
||||
\n.not_abort_trap:
|
||||
// If no other exception was flagged then report a generic error.
|
||||
\n s_andn2_b32 ttmp2, ttmp3, DOORBELL_ID_MASK
|
||||
\n s_cbranch_scc1 .send_interrupt
|
||||
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
|
||||
\n.send_interrupt:
|
||||
// m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id
|
||||
\n s_mov_b32 ttmp2, m0
|
||||
\n s_mov_b32 m0, ttmp3
|
||||
\n s_nop 0x0 // Manually inserted wait states
|
||||
\n s_sendmsg sendmsg(MSG_INTERRUPT)
|
||||
\n s_waitcnt lgkmcnt(0) // Wait for the message to go out.
|
||||
\n s_mov_b32 m0, ttmp2
|
||||
// Parking the wave requires saving the original pc in the preserved ttmps.
|
||||
// Register layout before parking the wave:
|
||||
//
|
||||
// ttmp7: 0[31:0]
|
||||
// ttmp11: 1st_level_ttmp11[31:23] 0[15:0] 1st_level_ttmp11[6:0]
|
||||
//
|
||||
// After parking the wave:
|
||||
//
|
||||
// ttmp7: pc_lo[31:0]
|
||||
// ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0]
|
||||
\n.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) || (.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3)
|
||||
\n // Save the PC
|
||||
\n s_mov_b32 ttmp7, ttmp0
|
||||
\n s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK
|
||||
\n s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT
|
||||
\n s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT)
|
||||
\n s_or_b32 ttmp11, ttmp11, ttmp1
|
||||
// Park the wave
|
||||
\n s_getpc_b64 [ttmp0, ttmp1]
|
||||
\n s_add_u32 ttmp0, ttmp0, .parked - .
|
||||
\n s_addc_u32 ttmp1, ttmp1, 0x0
|
||||
\n.endif
|
||||
\n.halt_wave:
|
||||
// Halt the wavefront upon restoring STATUS below.
|
||||
\n s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT
|
||||
\n s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_SHIFT
|
||||
\n.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4)
|
||||
\n s_bitcmp1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT
|
||||
\n s_cbranch_scc1 .ttmps_initialized
|
||||
\n s_mov_b32 ttmp4, 0
|
||||
\n s_mov_b32 ttmp5, 0
|
||||
\n s_bitset1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT
|
||||
\n.ttmps_initialized:
|
||||
\n.endif
|
||||
\n.exit_trap:
|
||||
// Restore SQ_WAVE_IB_STS.
|
||||
\n.if .amdgcn.gfx_generation_number == 9
|
||||
\n.if .amdgcn.gfx_generation_minor < 4
|
||||
\n s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
|
||||
\n.else
|
||||
\n s_lshr_b32 ttmp2, ttmp13, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
|
||||
\n.endif
|
||||
\n s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
|
||||
\n s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
|
||||
\n.elseif .amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3
|
||||
\n s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
|
||||
\n s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
|
||||
\n s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
|
||||
\n s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
|
||||
\n s_or_b32 ttmp2, ttmp2, ttmp3
|
||||
\n s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
|
||||
\n.endif
|
||||
// Restore SQ_WAVE_STATUS.
|
||||
\n s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
|
||||
\n s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
|
||||
\n s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12
|
||||
// Return to original (possibly modified) PC.
|
||||
\n s_rfe_b64 [ttmp0, ttmp1]
|
||||
\n.parked:
|
||||
\n s_trap 0x2
|
||||
\n s_branch .parked
|
||||
\n);
|
||||
} // namespace pal
|
||||
|
||||
@@ -816,6 +816,10 @@ Device::~Device() {
|
||||
// Destroy transfer queue
|
||||
delete xferQueue_;
|
||||
|
||||
if (trap_handler_ != nullptr) {
|
||||
trap_handler_->release();
|
||||
}
|
||||
|
||||
// Destroy blit program
|
||||
delete blitProgram_;
|
||||
|
||||
@@ -847,6 +851,7 @@ Device::~Device() {
|
||||
|
||||
extern const char* SchedulerSourceCode;
|
||||
extern const char* SchedulerSourceCode20;
|
||||
extern const char* TrapHandlerCode;
|
||||
|
||||
Pal::IDevice* gDeviceList[Pal::MaxDevices] = {};
|
||||
uint32_t gStartDevice = 0;
|
||||
@@ -1197,6 +1202,32 @@ bool Device::initializeHeapResources() {
|
||||
return false;
|
||||
}
|
||||
xferQueue_->enableSyncedBlit();
|
||||
// Setup trap handler if available
|
||||
if (trap_handler_ != nullptr) {
|
||||
auto program = reinterpret_cast<pal::LightningProgram*>(
|
||||
trap_handler_->getDeviceProgram(*this));
|
||||
if (program != nullptr) {
|
||||
Pal::Result result{Pal::Result::Success};
|
||||
Pal::GpuMemoryRef memRef = {};
|
||||
memRef.pGpuMemory = program->codeSegGpu().iMem();
|
||||
if (!settings().alwaysResident_) {
|
||||
// Make sure trap handler is always resident in memory
|
||||
// note: this code path is for OpenCL only, since HIP has alwaysResident_ enabled
|
||||
result = iDev()->AddGpuMemoryReferences(1, &memRef, nullptr, Pal::GpuMemoryRefCantTrim);
|
||||
}
|
||||
if (result == Pal::Result::Success) {
|
||||
// Find an offset in memory for the trap handler.
|
||||
// Loader returns an absolute address, but PAL accepts base + offset, hense find offset
|
||||
auto offset = program->GetTrapHandlerAddress() - memRef.pGpuMemory->Desc().gpuVirtAddr;
|
||||
#ifdef PAL_DEBUGGER
|
||||
// Bind trap handler to the kernel mode driver
|
||||
iDev()->BindTrapHandler(Pal::PipelineBindPoint::Compute, memRef.pGpuMemory, offset);
|
||||
#endif
|
||||
} else {
|
||||
LogError("Failed to make trap handler resident in memory");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -2552,6 +2583,29 @@ bool Device::createBlitProgram() {
|
||||
LogError("Couldn't create blit kernels!");
|
||||
result = false;
|
||||
}
|
||||
|
||||
#ifdef PAL_DEBUGGER
|
||||
if (settings().useLightning_) {
|
||||
const std::string TrapHandlerAsm = TrapHandlerCode;
|
||||
// Create a program for trap handler
|
||||
// note: It's not critical for runtime functionality to fail trap handler initialization
|
||||
trap_handler_ = new amd::Program(*context_, TrapHandlerAsm.c_str(), amd::Program::Assembly);
|
||||
if (trap_handler_ != nullptr) {
|
||||
std::vector<amd::Device*> devices;
|
||||
devices.push_back(this);
|
||||
std::string opt = "-cl-internal-kernel ";
|
||||
if (auto retval =
|
||||
trap_handler_->build(devices, opt.c_str(), nullptr, nullptr, false) != CL_SUCCESS) {
|
||||
DevLogPrintfError("Build failed for trap handler with error code: %d\n", retval);
|
||||
}
|
||||
if (!trap_handler_->load()) {
|
||||
DevLogPrintfError("Could not load the trap handler \n");
|
||||
}
|
||||
} else {
|
||||
DevLogPrintfError("Trap handler creation failed\n");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@@ -713,6 +713,7 @@ class Device : public NullDevice {
|
||||
Pal::GpuMemoryHeapProperties
|
||||
heaps_[Pal::GpuHeapCount]; //!< Information about heaps, returned from PAL
|
||||
std::map<Pal::IQueue*, QueueRecycleInfo*> queue_pool_; //!< Pool of PAL queues for recycling
|
||||
amd::Program* trap_handler_ = nullptr; //!< Trap handler program for debugger setup
|
||||
};
|
||||
|
||||
/*@}*/ // namespace pal
|
||||
|
||||
@@ -278,7 +278,7 @@ bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWo
|
||||
|
||||
size_t kernelNamesSize = 0;
|
||||
acl_error errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_,
|
||||
RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize);
|
||||
RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
|
||||
return false;
|
||||
@@ -286,7 +286,7 @@ bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWo
|
||||
if (kernelNamesSize > 0) {
|
||||
std::vector<char> kernelNames(kernelNamesSize);
|
||||
errorCode = amd::Hsail::QueryInfo(palNullDevice().compiler(), binaryElf_, RT_KERNEL_NAMES,
|
||||
nullptr, kernelNames.data(), &kernelNamesSize);
|
||||
nullptr, kernelNames.data(), &kernelNamesSize);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
|
||||
return false;
|
||||
@@ -737,30 +737,34 @@ bool LightningProgram::createBinary(amd::option::Options* options) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
|
||||
bool internalKernel) {
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
// Find the size of global variables from the binary
|
||||
if (!FindGlobalVarSize(binary, binSize)) {
|
||||
buildLog_ += "Error: Cannot Find Global Var Sizes\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
for (const auto& kernelMeta : kernelMetadataMap_) {
|
||||
auto kernelName = kernelMeta.first;
|
||||
auto kernel = new LightningKernel(kernelName, this, internalKernel);
|
||||
if (kernel == nullptr) {
|
||||
// Skip metadata look-up and kernel creation for assembly and internal kernel.
|
||||
// @note: Runtime compiles only the second level trap handler from assembly
|
||||
if ((owner()->language() != amd::Program::Assembly) || !internal_) {
|
||||
// Find the size of global variables from the binary
|
||||
if (!FindGlobalVarSize(binary, binSize)) {
|
||||
buildLog_ += "Error: Cannot Find Global Var Sizes\n";
|
||||
return false;
|
||||
}
|
||||
if (!kernel->init()) {
|
||||
buildLog_ += "[ROC][Kernel] Could not get Code Prop Meta Data \n";
|
||||
return false;
|
||||
|
||||
for (const auto& kernelMeta : kernelMetadataMap_) {
|
||||
auto kernelName = kernelMeta.first;
|
||||
auto kernel = new LightningKernel(kernelName, this, internalKernel);
|
||||
if (kernel == nullptr) {
|
||||
return false;
|
||||
}
|
||||
if (!kernel->init()) {
|
||||
buildLog_ += "[ROC][Kernel] Could not get Code Prop Meta Data \n";
|
||||
return false;
|
||||
}
|
||||
kernels()[kernelName] = kernel;
|
||||
|
||||
kernel->setUniformWorkGroupSize(useUniformWorkGroupSize);
|
||||
}
|
||||
kernels()[kernelName] = kernel;
|
||||
|
||||
kernel->setUniformWorkGroupSize(useUniformWorkGroupSize);
|
||||
}
|
||||
|
||||
executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, nullptr);
|
||||
if (executable_ == nullptr) {
|
||||
LogError("Error: Executable for AMD HSA Code Object isn't created.");
|
||||
@@ -787,13 +791,14 @@ bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUnifo
|
||||
return true;
|
||||
}
|
||||
|
||||
bool LightningProgram::setKernels(void* binary, size_t binSize,
|
||||
amd::Os::FileDesc fdesc, size_t foffset, std::string uri) {
|
||||
// ================================================================================================
|
||||
bool LightningProgram::setKernels(void* binary, size_t binSize, amd::Os::FileDesc fdesc,
|
||||
size_t foffset, std::string uri) {
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
// Collect the information about compiled binary
|
||||
if (!isNull() && (palDevice().rgpCaptureMgr() != nullptr)) {
|
||||
apiHash_ = palDevice().rgpCaptureMgr()->AddElfBinary(binary, binSize, binary, binSize,
|
||||
codeSegGpu_->iMem(), codeSegGpu_->offset());
|
||||
apiHash_ = palDevice().rgpCaptureMgr()->AddElfBinary(
|
||||
binary, binSize, binary, binSize, codeSegGpu_->iMem(), codeSegGpu_->offset());
|
||||
}
|
||||
|
||||
for (auto& kit : kernels()) {
|
||||
@@ -812,4 +817,15 @@ bool LightningProgram::setKernels(void* binary, size_t binSize,
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
uint64_t LightningProgram::GetTrapHandlerAddress() const {
|
||||
uint64_t address = 0;
|
||||
hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
|
||||
auto trap_sym = executable_->GetSymbol("trap_entry", &agent);
|
||||
if (trap_sym != nullptr) {
|
||||
trap_sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &address);
|
||||
}
|
||||
return address;
|
||||
}
|
||||
|
||||
} // namespace pal
|
||||
|
||||
@@ -269,6 +269,7 @@ class LightningProgram : public HSAILProgram {
|
||||
isHIP_ = (owner.language() == amd::Program::HIP);
|
||||
}
|
||||
virtual ~LightningProgram() {}
|
||||
uint64_t GetTrapHandlerAddress() const;
|
||||
|
||||
protected:
|
||||
virtual bool createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
|
||||
|
||||
Ссылка в новой задаче
Block a user