Create prebuilt raslib package for RDC

Create a folder for the prebuilt raslib, which contains the RAS binary
and configuration files. The CMakeLists.txt is changed to include
those files.

Change-Id: I530198cff5686a19e58096c87457ab8b7c52d5f3
Этот коммит содержится в:
Bill(Shuzhou) Liu
2021-03-01 15:46:24 -05:00
родитель 5b4fbe08d2
Коммит 3aa95b210a
169 изменённых файлов: 25577 добавлений и 0 удалений
+11
Просмотреть файл
@@ -236,6 +236,17 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/example
DESTINATION ${RDC_CLIENT_INSTALL_PREFIX}/${RDC}
COMPONENT ${CLIENT_COMPONENT})
# Prebuilt RAS artifacts shipped with the client component.
# The shared library itself goes into <prefix>/<rdc>/lib.
install(
  FILES
    ${PROJECT_SOURCE_DIR}/ras_prebuild/librdc_ras.so
  DESTINATION ${RDC_CLIENT_INSTALL_PREFIX}/${RDC}/lib
  COMPONENT ${CLIENT_COMPONENT}
)
# The config/ and sp3/ trees are copied next to the library. The source paths
# have no trailing slash, so the directory names are kept under the destination.
install(
  DIRECTORY
    ${PROJECT_SOURCE_DIR}/ras_prebuild/config
    ${PROJECT_SOURCE_DIR}/ras_prebuild/sp3
  DESTINATION ${RDC_CLIENT_INSTALL_PREFIX}/${RDC}/lib
  COMPONENT ${CLIENT_COMPONENT}
)
set(CPACK_PACKAGE_NAME ${RDC_PACKAGE})
set(CPACK_PACKAGE_VERSION ${VERSION_STRING})
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+26
Просмотреть файл
@@ -0,0 +1,26 @@
{
"version": "0.0.1",
"devices": [
{
"name": "VEGA20",
"ids": [ "0x66A0", "0x66A1", "0x66A2", "0x66A3", "0x66A4", "0x66A7", "0x66AF" ],
"config": "vega20.json",
"gfx": "libgfx9.so",
"sdma": "libsdma4.so"
},
{
"name": "ARCTURUS",
"ids": [ "0x738C", "0x7388", "0x738E" ],
"config": "arcturus.json",
"gfx": "libgfx9.so",
"sdma": "libsdma4.so"
},
{
"name": "SIENNA_CICHLID",
"ids": [ "0x73A0", "0x73A2", "0x73A3", "0x73AB", "0x73AE", "0x73BF" ],
"config": "sienna_cichlid.json",
"gfx": "libgfx10.so",
"sdma": "libsdma5.so"
}
]
}
+34
Просмотреть файл
@@ -0,0 +1,34 @@
{
"version": "0.0.1",
"type": {
"parity": 1,
"single_correctable": 2,
"multi_uncorrectable": 4,
"poison": 8
},
"block": {
"umc": {
"index": 0,
"support": 1,
"type": [
"single_correctable",
"multi_uncorrectable",
"poison"
]
}
},
"tests": [
{
"name": "ras_umc.0.2",
"block": "umc",
"type": "single_correctable",
"nullDispatchCS": "sp3/gfx10/edc/bin/sienna_cichlid/gc_edc_sqc_inst_bank_snop.bin"
},
{
"name": "ras_umc.0.4",
"block": "umc",
"type": "multi_uncorrectable",
"nullDispatchCS": "sp3/gfx10/edc/bin/sienna_cichlid/gc_edc_sqc_inst_bank_snop.bin"
}
]
}
Разница между файлами не показана из-за своего большого размера Загрузить разницу
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
+31
Просмотреть файл
@@ -0,0 +1,31 @@
// Compute shader for GFX10 (wave32) whose body is a 1000-iteration sp3 `for`
// loop around a single s_nop, followed by s_endpgm.
// None of the SGPR/VGPR inputs documented below are read by the body.
shader main
asic(GFX10)
wave_size(32)
type(CS)
user_sgpr_count(9) // 2 for the buffer resource + 7 for the parameters listed below (s2..s8)
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
// NOTE(review): `for var ...` is presumably expanded by the assembler, yielding
// 1000 consecutive s_nop instructions rather than a runtime loop -- confirm.
for var i = 0; i < 1000; i++
s_nop 0x1
end
s_endpgm
end
+42
Просмотреть файл
@@ -0,0 +1,42 @@
// Compute shader that zero-initializes register and LDS state:
//   1. writes 0 to acc[0..255],
//   2. writes 0 to the VGPRs in groups of eight using GPR indexing,
//   3. writes the (zeroed) v2..v5 into LDS with ds_write2_b64.
// It takes no user SGPRs.
shader main
type(CS)
user_sgpr_count(0)
// Clear ACC VGPR
for var vgpr = 0; vgpr < 256; ++vgpr
v_accvgpr_write acc[vgpr], 0
end
s_movk_i32 m0, 0x0000
// s10 = 0xf8 (248) is the starting index; it is decremented by 8 each pass,
// so the loop body runs for indices 248, 240, ..., 0 (32 passes x 8 registers).
s_mov_b32 s10, 0x000000f8
// NOTE(review): mode 0x8 presumably enables indexing of the VGPR destination -- confirm against the ISA guide.
s_set_gpr_idx_on s10, 0x8
label_0004:
v_mov_b32 v0, 0
v_mov_b32 v1, 0
v_mov_b32 v2, 0
v_mov_b32 v3, 0
v_mov_b32 v4, 0
v_mov_b32 v5, 0
v_mov_b32 v6, 0
v_mov_b32 v7, 0
s_sub_u32 s10, s10, 8
s_set_gpr_idx_idx s10
// Loop while the subtraction above left SCC = 0 (presumably: no unsigned borrow, i.e. index did not wrap below 0).
s_cbranch_scc0 label_0004
s_set_gpr_idx_off
// Build a per-lane value in v1 from the two mbcnt instructions, then scale it by 8.
// NOTE(review): mbcnt_lo is paired with exec_hi and mbcnt_hi with exec_lo; this looks
// swapped relative to the usual lane-id idiom -- confirm it is intentional.
v_mbcnt_lo_u32_b32 v1, exec_hi, 0
v_mbcnt_hi_u32_b32 v1, exec_lo, v1
v_mul_u32_u24 v1, 8, v1
// s11 = 2-bit field at bit offset 4 of HW_ID, multiplied by 0x4000 and added to v1 as a base offset.
s_getreg_b32 s11, hwreg(HW_REG_HW_ID, 4, 2)
s_mulk_i32 s11, 0x4000
v_add_co_u32 v1, vcc, v1, s11
// s10 counts 7, 6, ..., 0: the LDS write loop body executes 8 times.
s_mov_b32 s10, 7
s_mov_b32 m0, -1
label_001B:
// Four 64-bit stores per pass (offsets 0, 64, 128, 192), then advance the address by 0x800;
// 8 passes x 0x800 = 0x4000, matching the per-field stride applied to s11 above.
ds_write2_b64 v1, v[2:3], v[2:3] offset1:64
ds_write2_b64 v1, v[4:5], v[4:5] offset0:128 offset1:192
v_add_co_u32 v1, vcc, 0x00000800, v1
s_sub_u32 s10, s10, 1
s_cbranch_scc0 label_001B
s_endpgm
end
+113
Просмотреть файл
@@ -0,0 +1,113 @@
// Compute shader that, in order:
//   1. zeroes acc[0..255],
//   2. spins in a scalar loop bounded by s8,
//   3. loads two 4-dword blocks from the address in s[0:1],
//   4. computes thread/group ids into v3, s28 and v9,
//   5. zeroes the VGPRs (via GPR indexing) and writes zeros to LDS,
//   6. stores an index derived from HW_ID through the buffer resource in s24 and reads it back.
shader main
type(CS)
// NOTE(review): the layout below places "loop" in s8 and the code compares against s8,
// but user_sgpr_count is 8 (s0..s7), which would put s_tgid_x in s8 rather than s9.
// A sibling GFX10 shader with the same layout declares 9 -- confirm which is intended.
user_sgpr_count(8) // 2 for the buffer resource + thread/thread group parameters (see list below)
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
// Clear ACC VGPR
for var vgpr = 0; vgpr < 256; ++vgpr
v_accvgpr_write acc[vgpr], 0
end
//sp3 loop for lifetime
s_mov_b32 s12, 0 //init loop idx s12
label_0001:
s_cmp_lt_i32 s12, s8 //scc = (s12 < s8) ? 1 : 0
s_cbranch_scc0 label_0006 //if(scc == 0) then jump to label_0006; else nop
v_mov_b32 v4,s12
s_add_i32 s12, s12, 1 //add loop incr
s_branch label_0001
label_0006: //end of SP3 loop
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// Second 4-dword load from offset 0x20; s[40:43] is not referenced again in this shader.
s_load_dwordx4 s[40:43], s[0:1], 0x20
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
// Clear VGPR and LDS
// NOTE(review): the indexed clear below appears to cover all VGPRs, which would
// also overwrite v3 and v9 computed above -- confirm that is expected.
s_movk_i32 m0, 0x0000
// s12 steps 248, 240, ..., 0: 32 passes, eight registers written per pass.
s_mov_b32 s12, 0x000000f8
s_set_gpr_idx_on s12, 0x8
label_0004:
v_mov_b32 v0, 0
v_mov_b32 v1, 0
v_mov_b32 v2, 0
v_mov_b32 v3, 0
v_mov_b32 v4, 0
v_mov_b32 v5, 0
v_mov_b32 v6, 0
v_mov_b32 v7, 0
s_sub_u32 s12, s12, 8
s_set_gpr_idx_idx s12
// Loop while SCC = 0 after the subtraction (presumably: no unsigned borrow yet).
s_cbranch_scc0 label_0004
s_set_gpr_idx_off
// Per-lane value in v1, scaled by 8, plus (HW_ID[5:4] * 0x4000) as a base offset.
// NOTE(review): mbcnt_lo uses exec_hi and mbcnt_hi uses exec_lo -- looks swapped; confirm.
v_mbcnt_lo_u32_b32 v1, exec_hi, 0
v_mbcnt_hi_u32_b32 v1, exec_lo, v1
v_mul_u32_u24 v1, 8, v1
s_getreg_b32 s13, hwreg(HW_REG_HW_ID, 4, 2)
s_mulk_i32 s13, 0x4000
v_add_co_u32 v1, vcc, v1, s13
// s12 counts 7..0: the LDS write loop body executes 8 times, advancing 0x800 per pass.
s_mov_b32 s12, 7
s_mov_b32 m0, -1
label_001B:
ds_write2_b64 v1, v[2:3], v[2:3] offset1:64
ds_write2_b64 v1, v[4:5], v[4:5] offset0:128 offset1:192
v_add_co_u32 v1, vcc, 0x00000800, v1
s_sub_u32 s12, s12, 1
s_cbranch_scc0 label_001B
// Save coverage in the memory
// Read the full 32-bit HW_ID register and extract bit fields from it.
s_getreg_b32 s20, hwreg(HW_REG_HW_ID, 0, 32)
// s12 = SIMD
s_lshr_b32 s12,s20,4
s_and_b32 s12, s12, 0x3
// s13 = CU
s_lshr_b32 s13,s20,8
s_and_b32 s13, s13, 0xf
// s14 = SE
s_lshr_b32 s14,s20,13
s_and_b32 s14, s14, 0x7
// s15 = SE * 16 * 4 + CU * 4 + SIMD
s_mul_i32 s16, s14, 64
s_mul_i32 s17, s13, 4
s_add_i32 s15, s16, s17
s_add_i32 s15, s15, s12
// s16 = s15 * 4: byte offset of this SIMD's dword slot.
s_mul_i32 s16, s15, 4
// Write the slot index into its own slot, then read it back into s17.
s_buffer_store_dword s15, s24, s16 glc
s_waitcnt 0
s_buffer_load_dword s17, s24, s16 glc
s_waitcnt 0
s_endpgm
end
+59
Просмотреть файл
@@ -0,0 +1,59 @@
// Compute shader that loads one dword per thread from the buffer described by the
// resource fetched from s[0:1] (indexed by the absolute thread id in v9), copies it
// to v10, and finally stores the marker 0xa5a50000 at offset 0x40 from s[0:1].
shader main
type(CS)
user_sgpr_count(8) // 2 for the buffer resource + 6 for the parameters listed below (s2..s7)
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
// s31 = 0 is passed as the scalar offset; v9 is the buffer index (idxen).
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
v_mov_b32 v10, v0
//buffer_load_dword v10, v9, s24, s31 idxen:1 glc:1
//s_waitcnt 0
//v_mov_b32 v11, v1
s_nop 0x1
s_nop 0x1
s_nop 0x1
s_nop 0x1
// Write the constant 0xa5a50000 to memory at s[0:1] + 0x40 before ending.
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end
+60
Просмотреть файл
@@ -0,0 +1,60 @@
// Compute shader that performs one GDS read per thread: the address is
// thread_id_in_group * 4 (v10), the result lands in v11 and is copied to v12.
// The loaded value is not written anywhere else before s_endpgm.
shader main
type(CS)
user_sgpr_count(8) // 2 for the buffer resource + 6 for the parameters listed below (s2..s7)
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read from the GDS
// v10 = v3 << 2: byte offset of this thread's dword.
v_lshlrev_b32 v10, 2, v3
// NOTE(review): m0 = 0xFFFF presumably sets the GDS size/bounds used by the ds instruction -- confirm.
s_mov_b32 m0, 0xFFFF
s_nop 0x1
s_nop 0x1
s_nop 0x1
ds_read_b32 v11, v10 gds:1
s_waitcnt 0
v_mov_b32 v12, v11
s_nop 0x1
s_nop 0x1
s_nop 0x1
s_nop 0x1
s_endpgm
end
+673
Просмотреть файл
@@ -0,0 +1,673 @@
shader main
type(CS)
/*************************************************************************/
/* control on how to run the shader */
/*************************************************************************/
//any hack that needs to be made to run this code in EMU (either because various EMU code is not ready or there is no compute save & restore in an EMU run)
var EMU_RUN_HACK = 1
var EMU_RUN_HACK_RESTORE_NORMAL = 0
var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
var SAVE_LDS = 0
var WG_BASE_ADDR_LO = 0x9000a000
var WG_BASE_ADDR_HI = 0x0
var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
var CTX_SAVE_CONTROL = 0x0
var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run)
var SGPR_SAVE_USE_SQC = 0 //use SQC D$ to do the write
var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC)
var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
/**************************************************************************/
/* variables */
/**************************************************************************/
var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
/* Save */
var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
var S_SAVE_SPI_INIT_ATC_SHIFT = 27
var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi
//tba_lo and tba_hi need to be saved/restored
var tba_lo = ttmp12
var tba_hi = ttmp13
var tma_lo = ttmp14
var tma_hi = ttmp15
var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
var s_save_status = ttmp4
var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
var s_save_xnack_mask_lo = ttmp6
var s_save_xnack_mask_hi = ttmp7
var s_save_buf_rsrc0 = ttmp8
var s_save_buf_rsrc1 = ttmp9
var s_save_buf_rsrc2 = ttmp10
var s_save_buf_rsrc3 = ttmp11
var s_save_mem_offset = tma_lo
var s_save_alloc_size = s_save_trapsts //conflict
var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
var s_save_m0 = tma_hi
/* Restore */
var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
var s_restore_spi_init_lo = exec_lo
var s_restore_spi_init_hi = exec_hi
var s_restore_mem_offset = ttmp2
var s_restore_alloc_size = ttmp3
var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored
var s_restore_mem_offset_save = s_restore_tmp //no conflict
var s_restore_m0 = s_restore_alloc_size //no conflict
var s_restore_mode = ttmp7
var s_restore_pc_lo = ttmp0
var s_restore_pc_hi = ttmp1
var s_restore_exec_lo = tma_lo //no conflict
var s_restore_exec_hi = tma_hi //no conflict
var s_restore_status = ttmp4
var s_restore_trapsts = ttmp5
var s_restore_xnack_mask_lo = xnack_mask_lo
var s_restore_xnack_mask_hi = xnack_mask_hi
var s_restore_buf_rsrc0 = ttmp8
var s_restore_buf_rsrc1 = ttmp9
var s_restore_buf_rsrc2 = ttmp10
var s_restore_buf_rsrc3 = ttmp11
/**************************************************************************/
/* trap handler entry points */
/**************************************************************************/
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
//FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
//FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
else
s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
end
L_JUMP_TO_RESTORE:
s_branch L_RESTORE //restore
L_SKIP_RESTORE:
s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
s_cbranch_scc1 L_SAVE //this is the operation for save
//the potential code (such as restore STATUS) on this path is for regular trap handling and does not matter for compute save & restore
//EMU will not execute the code since in hack mode it is skipped while in normal mode there is no save in EMU
//SIM will only execute the code in normal S/R mode but not in hack mode
if (!EMU_RUN_HACK)
L_ERROR: //to catch incorrect savectx setting in SIM assuming the trap handler is only used for save & restore
s_branch L_ERROR
end
/**************************************************************************/
/* save routine */
/**************************************************************************/
L_SAVE:
//check whether there is mem_viol
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
s_cbranch_scc0 L_NO_PC_REWIND
//if so, need rewind PC assuming GDS operation gets NACKed
s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
L_NO_PC_REWIND:
s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
/* inform SPI the readiness and wait for SPI's go signal */
s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
s_mov_b32 s_save_exec_hi, exec_hi
s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
if (EMU_RUN_HACK)
else
s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
end
L_SLEEP:
s_sleep 0x2
if (EMU_RUN_HACK)
else
s_cbranch_execz L_SLEEP
end
/* setup Resource Constants */
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
//calculate wd_addr using absolute thread id
v_readlane_b32 s_save_tmp, v9, 0
s_lshr_b32 s_save_tmp, s_save_tmp, 6
s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
else
end
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
else
end
s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
//FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
s_mov_b32 s_save_m0, m0 //save M0
/* global mem offset */
s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
/* the first wave in the threadgroup */
s_barrier //FIXME not performance-optimal "LDS is used? wait for other waves in the same TG"
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
s_cbranch_scc0 L_SAVE_VGPR
/* save LDS */
//////////////////////////////
L_SAVE_LDS:
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
s_mov_b32 exec_hi, 0xFFFFFFFF
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
s_cbranch_scc0 L_SAVE_VGPR //no lds used? jump to L_SAVE_VGPR
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 m0, 0x0 //lds_offset initial value = 0
L_SAVE_LDS_LOOP:
if (SAVE_LDS)
buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1
end
s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //mem offset increased by 256 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
/* save VGPRs */
//////////////////////////////
L_SAVE_VGPR:
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
s_mov_b32 exec_hi, 0xFFFFFFFF
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 m0, 0x0 //VGPR initial index value =0
s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later
L_SAVE_VGPR_LOOP:
v_mov_b32 v0, v0 //v0 = v[0+m0]
if(USE_MTBUF_INSTEAD_OF_MUBUF)
tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
end
s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //every buffer_store_dword does 256 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
s_set_gpr_idx_off
/* save SGPRs */
//////////////////////////////
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
if (SGPR_SAVE_USE_SQC)
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
else
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
end
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 m0, 0x0 //SGPR initial index value =0
s_nop 0x0 //Manually inserted wait states
L_SAVE_SGPR_LOOP:
s_movrels_b32 s0, s0 //s0 = s[0+m0]
write_sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4
s_add_u32 m0, m0, 1 //next sgpr index
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
/* save HW registers */
//////////////////////////////
L_SAVE_HWREG:
s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
write_sgpr_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO
s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI
end
write_sgpr_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC
write_sgpr_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
write_sgpr_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC
write_sgpr_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
write_sgpr_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS
//s_save_trapsts conflicts with s_save_alloc_size
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
write_sgpr_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS
write_sgpr_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO
write_sgpr_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_HI
//use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
write_sgpr_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
write_sgpr_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TBA_LO
write_sgpr_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TBA_HI
/* S_PGM_END_SAVED */ //FIXME graphics ONLY
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
s_rfe_b64 s_save_pc_lo //Return to the main shader program
else
end
s_branch L_END_PGM
/**************************************************************************/
/* restore routine */
/**************************************************************************/
L_RESTORE:
/* Setup Resource Constants */
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
//calculate wd_addr using absolute thread id
v_readlane_b32 s_restore_tmp, v9, 0
s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
else
end
s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
/* global mem offset */
s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
/* the first wave in the threadgroup */
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
s_cbranch_scc0 L_RESTORE_VGPR
/* restore LDS */
//////////////////////////////
L_RESTORE_LDS:
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
s_mov_b32 exec_hi, 0xFFFFFFFF
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 m0, 0x0 //lds_offset initial value = 0
L_RESTORE_LDS_LOOP:
if (SAVE_LDS)
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
end
s_add_u32 m0, m0, 256 //every buffer_load_dword does 256 bytes
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?
/* restore VGPRs */
//////////////////////////////
L_RESTORE_VGPR:
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
s_mov_b32 exec_hi, 0xFFFFFFFF
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256
s_mov_b32 m0, 1 //VGPR initial index value = 1
s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later
L_RESTORE_VGPR_LOOP:
if(USE_MTBUF_INSTEAD_OF_MUBUF)
tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
end
s_waitcnt vmcnt(0) //ensure data ready
v_mov_b32 v0, v0 //v[0+m0] = v0
s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //every buffer_load_dword does 256 bytes
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete?
s_set_gpr_idx_off
/* VGPR restore on v0 */
if(USE_MTBUF_INSTEAD_OF_MUBUF)
tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
end
/* restore SGPRs */
//////////////////////////////
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
if (SGPR_SAVE_USE_SQC)
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
else
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
end
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
read_sgpr_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp
s_mov_b32 m0, 0x1 //SGPR initial index value =1 //go on with with s1
L_RESTORE_SGPR_LOOP:
read_sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made
s_waitcnt lgkmcnt(0) //ensure data ready
s_movreld_b32 s0, s0 //s[0+m0] = s0
s_add_u32 m0, m0, 1 //next sgpr index
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete?
s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */
/* restore HW registers */
//////////////////////////////
L_RESTORE_HWREG:
s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
read_sgpr_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0
read_sgpr_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC
read_sgpr_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
read_sgpr_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC
read_sgpr_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
read_sgpr_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS
read_sgpr_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS
read_sgpr_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO
read_sgpr_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI
read_sgpr_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE
read_sgpr_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TBA_LO
read_sgpr_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TBA_HI
s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
//for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
end
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal
s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
end
s_mov_b32 m0, s_restore_m0
s_mov_b32 exec_lo, s_restore_exec_lo
s_mov_b32 exec_hi, s_restore_exec_hi
s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
//s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
//reuse s_restore_m0 as a temp register
s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status
s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time
// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
/**************************************************************************/
/* the END */
/**************************************************************************/
L_END_PGM:
s_endpgm
end
/**************************************************************************/
/* the helper functions */
/**************************************************************************/
// Store one SGPR value through buffer resource s_rsrc and advance the running offset.
//   s            - SGPR whose value is stored
//   s_rsrc       - buffer resource descriptor
//   s_mem_offset - SGPR holding the current offset; bumped in place (4 on the SQC path, 256 otherwise)
//   use_sqc      - assemble-time flag: store with s_buffer_store_dword
//   use_mtbuf    - assemble-time flag, only consulted when use_sqc is false:
//                  tbuffer_store_format_x instead of buffer_store_dword
// Side effects: the SQC path overwrites exec_lo (used to park m0); the vector paths overwrite v0.
function write_sgpr_to_mem(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
    if (use_sqc)
        s_mov_b32 exec_lo, m0 //park m0 in exec_lo; exec_lo is assumed to be dead from this point on
        s_mov_b32 m0, s_mem_offset
        s_buffer_store_dword s, s_rsrc, m0 glc:1
        s_add_u32 s_mem_offset, s_mem_offset, 4
        s_mov_b32 m0, exec_lo //hand m0 back to the caller
    else
        //both vector paths stage the value in v0 and step the offset by 256
        v_mov_b32 v0, s
        if (use_mtbuf)
            tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
        else
            buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
        end
        s_add_u32 s_mem_offset, s_mem_offset, 256
    end
end
// Load one dword into s from buffer resource s_rsrc at offset s_mem_offset, then advance
// s_mem_offset for the next call.
//   s            - destination SGPR
//   s_rsrc       - buffer resource descriptor
//   s_mem_offset - SGPR holding the current offset; bumped in place
//   use_sqc      - assemble-time flag selecting the stride: 4 when set, 256 otherwise
//                  (the same strides write_sgpr_to_mem applies on its SQC / vector paths)
// The load is always an s_buffer_load_dword regardless of use_sqc. No s_waitcnt is issued
// here, so the caller has to wait on lgkmcnt before consuming s.
function read_sgpr_from_mem(s, s_rsrc, s_mem_offset, use_sqc)
s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
if (use_sqc)
s_add_u32 s_mem_offset, s_mem_offset, 4
else
s_add_u32 s_mem_offset, s_mem_offset, 256
end
end
+21
Просмотреть файл
@@ -0,0 +1,21 @@
// Compute shader: busy-count loop.
// Loads a 32-bit limit from the address held in s[0:1], counts v5 upward until it is no
// longer below that limit, then stores the final count through the address held in s[2:3].
shader main
type(CS)
user_sgpr_count(4)
// copy the four user SGPRs into VGPRs so they can serve as flat addresses
v_mov_b32 v0, s0
v_mov_b32 v1, s1
v_mov_b32 v2, s2
v_mov_b32 v3, s3
// v4 = loop limit read from memory
flat_load_dword v4, v[0:1] slc
s_waitcnt vmcnt(0)&lgkmcnt(0)
// v5 = counter
v_mov_b32 v5, 0
// NOTE(review): the hardware may honor only the low bits of this immediate — confirm the intended delay
s_sleep 40000
LOOP:
// the counter is incremented before the compare, so the body runs at least once
v_add_co_u32 v5, vcc, 1, v5
s_waitcnt vmcnt(0)&lgkmcnt(0)
// keep looping while any lane still has v5 < v4
v_cmp_lt_u32 vcc, v5, v4
s_cbranch_vccnz LOOP
// NOTE(review): the address pair is written v[2,3] here but v[0:1] on the load — confirm the assembler treats both forms as a 64-bit register pair
flat_store_dword v[2,3], v5
s_waitcnt vmcnt(0)&lgkmcnt(0)
s_endpgm
end
+69
Просмотреть файл
@@ -0,0 +1,69 @@
// Compute shader: buffer store loop followed by a buffer load loop.
// Every thread stores v0 to the buffer described by s[24:27] at its own index (v9), twice per
// iteration with the scalar offset s31 stepping by 0x100 from 0x80, for s8 iterations; it then
// rewinds s31 and issues the same pattern as loads.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 7 for the parameters listed below (s2..s8)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//store and load s8 times
//s30 keeps the original loop count so it can be restored for the load loop
s_mov_b32 s30, s8
//s31 = running scalar offset, starts at 0x80
s_mov_b32 s31, 0x80
//NOTE(review): s8 is decremented before it is tested, so s8 == 0 on entry would wrap around;
//presumably the host always passes s8 >= 1 — confirm
STORE_LOOP:
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x100
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x100
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
//restore the loop count and rewind the offset so the loads revisit the stored locations
s_mov_b32 s8, s30
s_mov_b32 s31, 0x80
s_waitcnt 0
LOAD_LOOP:
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x100
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x100
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
s_endpgm
end
+131
Просмотреть файл
@@ -0,0 +1,131 @@
// Compute shader: fully unrolled buffer store/load sequence.
// Every thread issues 20 buffer stores of v0 at its own index (v9) with the scalar offset s31
// stepping by 4 from 0x0, waits, and then issues 20 buffer loads that keep stepping s31 by 4.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 7 for the parameters listed below (s2..s8)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//20 stores followed by 20 loads, fully unrolled (s8 is not used as a loop count in this shader)
//NOTE(review): s30 receives a copy of s8 but is never read afterwards
s_mov_b32 s30, s8
s_mov_b32 s31, 0x0
//stores: scalar offsets 0x00 .. 0x4C
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
s_waitcnt 0
//loads: s31 is NOT rewound, so these cover scalar offsets 0x50 .. 0x9C
//NOTE(review): the loads therefore do not re-read the locations just stored — confirm this is intended
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
s_endpgm
end
+61
Просмотреть файл
@@ -0,0 +1,61 @@
// Compute shader: per-thread strided buffer stores, then a marker write.
// Every thread scales its absolute id by 0x4000 to form a VGPR offset (offen), stores v0 s8
// times with the scalar offset s31 stepping by 0x10000 from 0x10000, and finally writes
// 0xa5a50000 at byte offset 0x40 from the address in s[0:1].
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 7 for the parameters listed below (s2..s8)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//v9 = absolute thread id * 0x4000, used below as the per-thread offset (offen)
s_mov_b32 s32, 0x4000
v_mul_i32_i24 v9, v9, s32
//store s8 times (this shader has no load loop)
//NOTE(review): s30 receives a copy of s8 but is never read afterwards
s_mov_b32 s30, s8
s_mov_b32 s31, 0x10000
//NOTE(review): s8 is decremented before it is tested, so s8 == 0 on entry would wrap around;
//presumably the host always passes s8 >= 1 — confirm
STORE_LOOP:
buffer_store_dword v0, v9, s24, s31 offen:1
s_waitcnt 0
s_add_u32 s31, s31, 0x10000
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
//write the constant 0xa5a50000 at s[0:1] + 0x40
//NOTE(review): presumably a completion marker polled by the host — confirm
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end
+79
Просмотреть файл
@@ -0,0 +1,79 @@
// Compute shader: GDS write loop followed by a GDS read loop.
// Every thread loads one dword from the buffer in s[24:27] at its own index (v9), writes it
// to GDS s8 times with the address stepping by 0x10, reads GDS back s8 times with the same
// step, and finally writes 0xa5a50000 at byte offset 0x40 from the address in s[0:1].
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 7 for the parameters listed below (s2..s8)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
//write it to GDS
//s30 keeps the original loop count so it can be restored for the load loop
s_mov_b32 s30, s8
//v10 = absolute thread id * 4 (byte address for the GDS writes)
v_lshlrev_b32 v10, 2, v9
//NOTE(review): m0 is presumably the DS/GDS address limit that must be set before DS instructions — confirm
s_mov_b32 m0, 0xFFFF
//padding between the m0 write and the first DS instruction
s_nop 0x1
s_nop 0x1
s_nop 0x1
//NOTE(review): s8 is decremented before it is tested, so s8 == 0 on entry would wrap around;
//presumably the host always passes s8 >= 1 — confirm
STORE_LOOP:
ds_write_b32 v10, v0 gds:1 // GPU hang when GPU access the GDS with GFX queue
s_waitcnt 0
v_add_u16 v10, v10, 0x10
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
s_mov_b32 s8, s30
//NOTE(review): the read address is rebuilt from v3 (thread id in group) whereas the write address
//above came from v9 (absolute thread id); the two only coincide for thread group 0 — confirm intent
v_lshlrev_b32 v10, 2, v3
LOAD_LOOP:
ds_read_b32 v11, v10 gds:1
s_waitcnt 0
//consume the value that was read
v_mov_b32 v12, v11
v_add_u16 v10, v10, 0x10
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
//write the constant 0xa5a50000 at s[0:1] + 0x40
//NOTE(review): presumably a completion marker polled by the host — confirm
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end
+55
Просмотреть файл
@@ -0,0 +1,55 @@
// Compute shader: copy one dword per thread from GDS to memory.
// Every thread reads GDS at (thread id in group * 4) and stores the value to the buffer in
// s[24:27] at its absolute thread index (v9) with scalar offset s7. This shader itself never
// writes GDS, so it reads whatever is already there.
shader main
type(CS)
user_sgpr_count(8) // 2 for the buffer resource address + 6 for the parameters listed below (s2..s7)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read data from GDS
//v10 = thread id in group * 4 (byte address)
v_lshlrev_b32 v10, 2, v3
//NOTE(review): m0 is presumably the DS/GDS address limit that must be set before DS instructions — confirm
s_mov_b32 m0, 0xFFFF
//padding between the m0 write and the DS instruction
s_nop 1
s_nop 1
s_nop 1
ds_read_b32 v11, v10 gds:1
s_waitcnt 0
//write the data to memory
buffer_store_dword v11, v9, s24, s7 idxen:1 glc:1
s_waitcnt 0
s_endpgm
end
+68
Просмотреть файл
@@ -0,0 +1,68 @@
// Compute shader: ds_ordered_count followed by a GDS write.
// Builds m0 from s12 and a field of HW_ID, issues ds_ordered_count, then uses the returned
// value together with the HW_ID field and the lane index to form a GDS address, and writes v0 there.
shader main
type(CS)
user_sgpr_count(4)
tgid_x_en(1)
tgid_y_en(1)
tgid_z_en(1)
s_getreg_b32 s18, hwreg(HW_REG_HW_ID, 0, 32)
//s_bfe_u32 operand 0x2001e: 2-bit field starting at bit 30
//NOTE(review): s16 is only referenced by the commented-out add below
s_bfe_u32 s16, s18, 0x2001e // get meid
//s_bfe_u32 operand 0x20006: 2-bit field starting at bit 6
s_bfe_u32 s17, s18, 0x20006 // get pipeid
//s_add_u32 s17, s17, s16
// get ring id
//v20 takes s17 before the mask below is applied
v_mov_b32 v20, s17
s_and_b32 s17, s17, 0x7
// Get thread_id inside wave
v_mbcnt_lo_u32_b32 v8, 0xffffffff, 0
v_mbcnt_hi_u32_b32 v9, 0xffffffff, v8
s_waitcnt 0
// init: gds write address
v_mov_b32 v13, 0
// the first 128DW is for ordered-append counter
v_mov_b32 v14, 0x80
// offset ring
v_mov_b32 v15, 0x200
v_mul_lo_u32 v15, v15, v20 // ring offset
v_mov_b32 v16, 0x40 // wave_size
//v18 = v1 * s1 + v0, v17 = v18 >> 6
//NOTE(review): v17 is never read afterwards and v18 is overwritten after ds_ordered_count
v_mul_lo_u32 v18, v1, s1
v_add_co_u32 v18, vcc, v18, v0
v_lshrrev_b32 v17,6 ,v18
//m0 = ((s12 >> 6) & 0x7ff) | ((s17 & 0x7) << 18)
//NOTE(review): s12 is not written anywhere in this shader; presumably it is initialized by the dispatch — confirm
s_mov_b32 s9, s12
s_lshr_b32 s9, s9, 6
s_and_b32 s9, s9, 0x7ff
s_lshl_b32 s17, s17, 18
s_or_b32 s9, s9, s17
s_mov_b32 m0, s9
v_mov_b32 v10, 1
v_mov_b32 v11, 0
//v11 receives the value returned by the ordered count
ds_ordered_count v11, v10 gds:1 offset0:0 offset1:1
s_waitcnt 0
v_mov_b32 v18, v11
v_mul_lo_u32 v16, v16, v18 // waves offset before.
//v13 = (0x80 + ring offset + 0x40 * ordered count + lane index) * 4
v_add_co_u32 v13, vcc, v13, v14
v_add_co_u32 v13, vcc, v13, v15
v_add_co_u32 v13, vcc, v13, v16
v_add_co_u32 v13, vcc, v13, v9
v_lshlrev_b32 v13,2,v13
//m0 is reloaded with 0x4000 before the plain GDS write
s_mov_b32 m0, 0x4000
s_nop 0
ds_write_b32 v13, v0 gds:1
s_waitcnt 0
s_endpgm
end
+79
Просмотреть файл
@@ -0,0 +1,79 @@
// Compute shader: LDS write loop followed by an LDS read loop.
// Every thread loads one dword from the buffer in s[24:27] at its own index (v9), writes it
// to LDS s8 times with the address stepping by 0x10 from (thread id in group * 4), reads LDS
// back s8 times over the same addresses, and finally writes 0xa5a50000 at byte offset 0x40
// from the address in s[0:1].
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 7 for the parameters listed below (s2..s8)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
//store and load s8 times
//s30 keeps the original loop count so it can be restored for the load loop
s_mov_b32 s30, s8
//v10 = thread id in group * 4 (byte address into LDS)
v_lshlrev_b32 v10, 2, v3
//NOTE(review): m0 is presumably the LDS address limit that must be set before DS instructions — confirm
s_mov_b32 m0, 0xFFFF
//padding between the m0 write and the first DS instruction
s_nop 0x1
s_nop 0x1
s_nop 0x1
//NOTE(review): s8 is decremented before it is tested, so s8 == 0 on entry would wrap around;
//presumably the host always passes s8 >= 1 — confirm
STORE_LOOP:
ds_write_b32 v10, v0
s_waitcnt 0
v_add_u16 v10, v10, 0x10
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
//restore the loop count and rewind the LDS address
s_mov_b32 s8, s30
v_lshlrev_b32 v10, 2, v3
LOAD_LOOP:
ds_read_b32 v11, v10
s_waitcnt 0
//consume the value that was read
v_mov_b32 v12, v11
v_add_u16 v10, v10, 0x10
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
//write the constant 0xa5a50000 at s[0:1] + 0x40
//NOTE(review): presumably a completion marker polled by the host — confirm
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end
+55
Просмотреть файл
@@ -0,0 +1,55 @@
// Compute shader: copy one dword per thread from LDS to memory.
// Every thread reads LDS at (thread id in group * 4) and stores the value to the buffer in
// s[24:27] at its absolute thread index (v9) with scalar offset s7. This shader itself never
// writes LDS, so it reads whatever is already there.
shader main
type(CS)
user_sgpr_count(8) // 2 for the buffer resource address + 6 for the parameters listed below (s2..s7)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read it from LDS
//v10 = thread id in group * 4 (byte address into LDS)
v_lshlrev_b32 v10, 2, v3
//NOTE(review): m0 is presumably the LDS address limit that must be set before DS instructions — confirm
s_mov_b32 m0, 0xFFFF
//padding between the m0 write and the DS instruction
s_nop 1
s_nop 1
s_nop 1
ds_read_b32 v0, v10
s_waitcnt 0
//write the data to memory
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
s_waitcnt 0
s_endpgm
end
+52
Просмотреть файл
@@ -0,0 +1,52 @@
// Compute shader: buffer load followed by a buffer store of the same dword.
// Every thread loads one dword from the buffer in s[24:27] at its absolute thread index (v9)
// with scalar offset 0 and stores it back at the same index with scalar offset s7.
shader main
type(CS)
user_sgpr_count(8) // 2 for the buffer resource address + 6 for the parameters listed below (s2..s7)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
//export poisoned data to L2
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
s_waitcnt 0
s_endpgm
end
+77
Просмотреть файл
@@ -0,0 +1,77 @@
// Compute shader: buffer load, VGPR-to-VGPR copy chain, buffer store.
// Every thread loads one dword from the buffer in s[24:27] at its absolute thread index (v9),
// copies it into v10..v19, copies those into v29..v20, and then stores the originally loaded
// v0 at the same index with scalar offset s7.
shader main
type(CS)
user_sgpr_count(8) // 2 for the buffer resource address + 6 for the parameters listed below (s2..s7)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
s_mov_b32 s31, 0x0
//For vega20, we need to set bit 12 low. This bit will just be set low here in the shader.
//s_mov_b32 s24, 0x15c000
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
//store it 10 times (write the loaded value into v10..v19)
v_mov_b32 v10, v0
v_mov_b32 v11, v0
v_mov_b32 v12, v0
v_mov_b32 v13, v0
v_mov_b32 v14, v0
v_mov_b32 v15, v0
v_mov_b32 v16, v0
v_mov_b32 v17, v0
v_mov_b32 v18, v0
v_mov_b32 v19, v0
// read them back (v10..v19 are read as sources, results land in v29..v20)
v_mov_b32 v29, v10
v_mov_b32 v28, v11
v_mov_b32 v27, v12
v_mov_b32 v26, v13
v_mov_b32 v25, v14
v_mov_b32 v24, v15
v_mov_b32 v23, v16
v_mov_b32 v22, v17
v_mov_b32 v21, v18
v_mov_b32 v20, v19
//export poisoned data to L2
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
s_waitcnt 0
s_endpgm
end
+51
Просмотреть файл
@@ -0,0 +1,51 @@
// Compute shader: buffer load followed by a buffer store of the same dword.
// Every thread loads one dword from the buffer in s[24:27] at its absolute thread index (v9)
// with scalar offset 0 and stores it back at the same index with scalar offset s7.
shader main
type(CS)
user_sgpr_count(8) // 2 for the buffer resource address + 6 for the parameters listed below (s2..s7)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
//export poisoned data to L2
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
s_waitcnt 0
s_endpgm
end
+55
Просмотреть файл
@@ -0,0 +1,55 @@
// Compute shader: SGPR-to-SGPR copy chain.
// Saves s2 in s16, copies s0..s9 into s30..s39 and back again, then stores s16 at byte
// offset 0 from the address in s[0:1].
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 7 for the parameters listed below (s2..s8)
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//SPI may touch s0...sn before shader is run
//s16 = s2, the value written out at the end
s_mov_b32 s16, s2
//write data (s0..s9 -> s30..s39)
s_mov_b32 s30, s0
s_mov_b32 s31, s1
s_mov_b32 s32, s2
s_mov_b32 s33, s3
s_mov_b32 s34, s4
s_mov_b32 s35, s5
s_mov_b32 s36, s6
s_mov_b32 s37, s7
s_mov_b32 s38, s8
s_mov_b32 s39, s9
//read back (s30..s39 -> s0..s9)
s_mov_b32 s0, s30
s_mov_b32 s1, s31
s_mov_b32 s2, s32
s_mov_b32 s3, s33
s_mov_b32 s4, s34
s_mov_b32 s5, s35
s_mov_b32 s6, s36
s_mov_b32 s7, s37
s_mov_b32 s8, s38
s_mov_b32 s9, s39
//the store address is the s[0:1] pair that just went through the copy chain
s_store_dword s16, s[0:1], 0x0 glc
s_endpgm
end
+75
Просмотреть файл
@@ -0,0 +1,75 @@
// Compute shader: scalar buffer stores at a 4 KB stride.
// Loads two buffer resources (s[24:27] from offset 0 and s[20:23] from offset 16 of the
// address in s[0:1]) and stores s8 through s[20:23] s8 times, with m0 stepping by 4*1024
// from 0. The read-back loop is only assembled when DEBUG_FUNCTION is non-zero.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 7 for the parameters listed below (s2..s8)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//NOTE(review): MTYPE_UC is OR-ed into s27, i.e. the s[24:27] descriptor, but the loops below
//only use s[20:23] — confirm whether s23 was intended
var MTYPE_UC = 0x38000000
s_or_b32 s27, s27, MTYPE_UC
//s30 keeps the original loop count for the optional read-back loop
s_mov_b32 s30, s8
s_mov_b32 m0, 0x0
//NOTE(review): s8 is decremented before it is tested, so s8 == 0 on entry would wrap around;
//presumably the host always passes s8 >= 1 — confirm
STORE_LOOP:
//the stored data is the current loop counter value
s_buffer_store_dword s8, s[20:23], m0 glc:1
s_waitcnt 0
s_add_u32 m0, m0, 4*1024 // step one 4KB page table address
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
var DEBUG_FUNCTION = 0
// The read-back check is left out by default to halve the shader run time
if DEBUG_FUNCTION
s_mov_b32 s8, s30
s_mov_b32 m0, 0x0
LOAD_LOOP:
s_buffer_load_dword s0, s[20:23], m0 glc:1
s_waitcnt 0
s_add_u32 m0, m0, 4*1024
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
end
s_endpgm
end
+96
Просмотреть файл
@@ -0,0 +1,96 @@
// Compute shader: scalar buffer store / load pairs at offsets 0x0 and 0x40, with glc:1.
// For s8 iterations it stores the loop counter through s[24:27] at each of the two offsets
// and immediately loads it back into s32, then issues s_dcache_wb.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 7 for the parameters listed below (s2..s8)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
/*
s_bfe_u32 s33, s8, 0x20004 // extract bank select bits
s_lshl_b32 s33, s33, 6 // ((bank_sel & 0x3) << 6) , bank_sel = address[9:8] ^ address[7:6], if 4 bank enabled
s_and_b32 s8, s8, 0xf
*/
//store and load s8 times
//NOTE(review): s30 and s31 are written here but never read afterwards
s_mov_b32 s30, s8
s_mov_b32 s31, 0x0
s_or_b32 s26, s26, 0x1000 //force the buffer size in the descriptor to be large enough
//NOTE(review): s8 is decremented before it is tested, so s8 == 0 on entry would wrap around;
//presumably the host always passes s8 >= 1 — confirm
STORE_LOOP:
//despite the name, only BANKA and BANKB are touched; the BANKC/BANKD sequence is commented out
var TOUCH_4_BANKS=1
if TOUCH_4_BANKS
s_mov_b32 m0, 0x0 // BANKA
s_buffer_store_dword s8, s[24:27], m0 glc:1
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:1
s_waitcnt 0
s_mov_b32 m0, 0x40 // BANKB
s_buffer_store_dword s8, s[24:27], m0 glc:1
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:1
s_waitcnt 0
/*
s_mov_b32 m0, 0x80 // BANKC
s_buffer_store_dword s8, s[24:27], m0 glc:1
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:1
s_waitcnt 0
s_mov_b32 m0, 0xC0 // BANKD
s_buffer_store_dword s8, s[24:27], m0 glc:1
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:1
s_waitcnt 0
*/
end
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
s_dcache_wb // to make emu, sim img match...
s_endpgm
end
+96
Просмотреть файл
@@ -0,0 +1,96 @@
// Compute shader: scalar buffer store / load pairs at offsets 0x0 and 0x40, with glc:0.
// For s8 iterations it stores the loop counter through s[24:27] at each of the two offsets
// and immediately loads it back into s32, then issues s_dcache_wb.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 7 for the parameters listed below (s2..s8)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
/*
s_bfe_u32 s33, s8, 0x20004 // extract bank select bits
s_lshl_b32 s33, s33, 6 // ((bank_sel & 0x3) << 6) , bank_sel = address[9:8] ^ address[7:6], if 4 bank enabled
s_and_b32 s8, s8, 0xf
*/
//store and load s8 times
//NOTE(review): s30 and s31 are written here but never read afterwards
s_mov_b32 s30, s8
s_mov_b32 s31, 0x0
s_or_b32 s26, s26, 0x1000 //force the buffer size in the descriptor to be large enough
//NOTE(review): s8 is decremented before it is tested, so s8 == 0 on entry would wrap around;
//presumably the host always passes s8 >= 1 — confirm
STORE_LOOP:
//despite the name, only BANKA and BANKB are touched; the BANKC/BANKD sequence is commented out
var TOUCH_4_BANKS=1
if TOUCH_4_BANKS
s_mov_b32 m0, 0x0 // BANKA
s_buffer_store_dword s8, s[24:27], m0 glc:0
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:0
s_waitcnt 0
s_mov_b32 m0, 0x40 // BANKB
s_buffer_store_dword s8, s[24:27], m0 glc:0
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:0
s_waitcnt 0
/*
s_mov_b32 m0, 0x80 // BANKC
s_buffer_store_dword s8, s[24:27], m0 glc:1
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:1
s_waitcnt 0
s_mov_b32 m0, 0xC0 // BANKD
s_buffer_store_dword s8, s[24:27], m0 glc:1
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:1
s_waitcnt 0
*/
end
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
s_dcache_wb // to make emu, sim img match...
s_endpgm
end
+112
Просмотреть файл
@@ -0,0 +1,112 @@
// Compute shader: scalar-cache partial-line write / read sequence (dirty-bit RAM exercise).
// For s8 iterations it performs, for each of two offset pairs (s12/s13 and s14/s15), a glc:1
// store at the higher offset, a glc:0 store at the lower offset and a glc:0 load at the
// higher offset; all four offsets then advance by 0x80. It ends with s_dcache_wb and a write
// of 0xa5a50000 at byte offset 0x40 from the address in s[0:1].
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 7 for the parameters listed below (s2..s8)
//s[0:1] the memory address for the buffer resource
//(judging by the formulas below: lowercase = thread-group dimensions, uppercase = grid dimensions in groups)
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
s_or_b32 s27, s27, 0x8000000 // changing mtype to non volatile
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//store and load s8 times
//NOTE(review): s30 and s31 are written here but never read afterwards
s_mov_b32 s30, s8
s_mov_b32 s31, 0x0
//s9..s11 (the thread-group id registers, already consumed above) are reused as store data
s_mov_b32 s9, 0xaa
s_mov_b32 s10, 0xbb
s_mov_b32 s11, 0xcc
// BUFFER STORE OFFSETS FOR BANK A AND BANKB
s_mov_b32 s12, 0x0
s_mov_b32 s13, 0x10
s_mov_b32 s14, 0x40
s_mov_b32 s15, 0x50
// The following sequence is needed to inject an error in the dirty bit RAM. Sequence was provided by SQC designer 4/1/2015
//1. you have an invalid line in data cache,
//2. you write to some of the dwords in that line (the remaining dwords are still invalid),
//3. then there is a read request that hits on that line, but it needs the dwords that are not yet there in that line
//(in other words, it needs some of the invalid dwords of that line),
//4. the request will go to TC,
//5. when the TC return comes back, the dirty bit RAM will be read
//NOTE(review): s8 is decremented before it is tested, so s8 == 0 on entry would wrap around;
//presumably the host always passes s8 >= 1 — confirm
STORE_LOOP:
//despite the name, only the BANKA and BANKB offset pairs are touched
var TOUCH_4_BANKS=1
if TOUCH_4_BANKS
//the stores below are dwordx2, i.e. two dwords each: s[8:9] = {loop counter, 0xaa}, s[10:11] = {0xbb, 0xcc}
s_mov_b32 m0, s13 // BANKA write to tc (glc:1)
s_buffer_store_dwordx2 s[8:9], s[24:27], m0 glc:1
s_waitcnt 0
s_mov_b32 m0, s12 // BANKA. write to sqc (glc:0)
s_buffer_store_dwordx2 s[10:11], s[24:27], m0 glc:0
s_waitcnt 0
s_mov_b32 m0, s13 // BANK A read the dword that is not in cache
s_buffer_load_dword s32, s[24:27], m0 glc:0
s_waitcnt 0
s_mov_b32 m0, s15 // BANKB write to tc (glc:1)
s_buffer_store_dwordx2 s[8:9], s[24:27], m0 glc:1
s_waitcnt 0
s_mov_b32 m0, s14 // BANKB write to sqc (glc:0)
s_buffer_store_dwordx2 s[10:11], s[24:27], m0 glc:0
s_waitcnt 0
s_mov_b32 m0, s15 // BANK B read the dword that is not in cache
s_buffer_load_dword s32, s[24:27], m0 glc:0
s_waitcnt 0
end
//advance all four offsets by 0x80 for the next iteration
s_add_u32 s12, s12,0x80
s_add_u32 s13, s13,0x80
s_add_u32 s14, s14,0x80
s_add_u32 s15, s15,0x80
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
s_dcache_wb // to make emu, sim img match...
//write the constant 0xa5a50000 at s[0:1] + 0x40
//NOTE(review): presumably a completion marker polled by the host — confirm
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end
+63
Просмотреть файл
@@ -0,0 +1,63 @@
shader main
// Instruction-page test: after the common setup, the program body consists of 34
// consecutive 4 KB blocks. Each block starts with a branch to the next block
// (taken when EXEC is non-zero) and is padded with v_mov_b32 filler.
type(CS)
user_sgpr_count(9) // s[0:1] resource address + s2-s6 thread/thread group parameters + s7 output offset + s8 loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc (s[20:23] is not referenced again in this shader)
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
// OR the MTYPE_UC bits into dword 3 of the resource loaded into s[24:27].
// NOTE(review): no memory instruction in this shader uses s[24:27] afterwards.
var MTYPE_UC = 0x38000000
s_or_b32 s27, s27, MTYPE_UC
label inst_page[34+1] // 34 4k pages, plus one label marking the end of the last page
for var i =0; i < 34; i++
inst_page[i]:
//each block is 4 KB in size: 1 branch dword + 1023 filler dwords
s_cbranch_execnz inst_page[i+1] //1 dword; jumps to the start of the next page when EXEC is non-zero
for var j = 0; j < (4*1024)/4 -1; j++
v_mov_b32 v0, 0 // each with 1 dword; filler, only executed when the branch above is not taken
end
end
inst_page[34]:
s_endpgm
end
+69
Просмотреть файл
@@ -0,0 +1,69 @@
shader main
// Instruction-stream test made of a chain of 16-instruction blocks: each of the
// first 15 blocks is an s_branch to the next block followed by 15 v_nop fillers.
// The final label is followed by 81 s_nop instructions and s_endpgm.
type(CS)
user_sgpr_count(9) // s[0:1] resource address + s2-s6 thread/thread group parameters + s7 output offset + s8 loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
// don't care about the loop count, fix 8 loops
// Total number of cache lines equals 2 (A, B) * 8
var num_cache_lines = 16
label BLOCK_64B[num_cache_lines]
// Blocks 0 .. num_cache_lines-2: 1 branch + 15 v_nop = 16 instructions
// (64 bytes if each instruction is one dword, as the "1DW" note indicates for s_branch).
for var loop = 0; loop < num_cache_lines - 1; loop++
BLOCK_64B[loop]:
s_branch BLOCK_64B[loop+1] // 1DW
for var i = 0; i < 15; i++
v_nop
end
end
// last block
// NOTE(review): these 15 v_nop are emitted BEFORE the BLOCK_64B[num_cache_lines-1]
// label, so the branch chain jumps over them and the final label is not placed
// right after a full 16-instruction block — confirm this layout is intended.
for var i = 0; i < 15; i++
v_nop
end
//For uei 2 msb and lsb flipped
// s_nop will become v_nop and it will be a legal instruction
BLOCK_64B[num_cache_lines-1]:
for var i = 0; i < 81; i++
s_nop 0x1
end
s_endpgm
end
/** comment, four bank interleave
Addr 0x90000000 => Bank A
Addr 0x90000040 => Bank B
Addr 0x90000080 => Bank C
Addr 0x900000c0 => Bank D
Addr 0x90000100 => Bank B
Addr 0x90000140 => Bank A
Addr 0x90000180 => Bank D
Addr 0x900001c0 => Bank C
Addr 0x90000200 => Bank C
Addr 0x90000240 => Bank D
Addr 0x90000280 => Bank A
Addr 0x900002c0 => Bank B
Addr 0x90000300 => Bank D
Addr 0x90000340 => Bank C
Addr 0x90000380 => Bank B
**/
+29
Просмотреть файл
@@ -0,0 +1,29 @@
shader main
// Minimal shader: executes 1000 s_nop instructions and ends. It performs no
// memory access and reads none of its user SGPRs.
type(CS)
user_sgpr_count(9) // s[0:1] resource address + s2-s6 thread/thread group parameters + s7 output offset + s8 loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
// Assembly-time loop: emits 1000 copies of the s_nop below.
for var i = 0; i < 1000; i++
s_nop 0x1
end
s_endpgm
end
+51
Просмотреть файл
@@ -0,0 +1,51 @@
shader main
// Per-thread copy: each thread loads one dword from the buffer at the index given
// by its absolute thread id, then stores that dword back at the same index with
// the output offset s7 applied as the scalar offset.
type(CS)
user_sgpr_count(8) // s[0:1] resource address + s2-s6 thread/thread group parameters + s7 output offset
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read from memory
s_mov_b32 s31, 0x0 // scalar offset 0 for the load
// Indexed load (idxen:1) using v9 as the index.
// NOTE(review): "s24" presumably names the 4-dword resource loaded into s[24:27] — confirm assembler shorthand.
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
//write the data to memory
// Same index, but with s7 (output offset) as the scalar offset.
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
s_waitcnt 0
s_endpgm
end
+73
Просмотреть файл
@@ -0,0 +1,73 @@
shader main
// Store-then-load loop test. Writes 0xa5a50001 at offset 0x40 from s[0:1] on
// entry, performs s8 indexed stores followed by s8 indexed loads (scalar offset
// stepping by 4 each time), then writes 0xa5a50000 at the same location.
// NOTE(review): the two values are presumably running/done markers read by the host — confirm.
type(CS)
user_sgpr_count(9) // s[0:1] resource address + s2-s6 thread/thread group parameters + s7 output offset + s8 loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
// Entry marker: store 0xa5a50001 at offset 0x40 from the address in s[0:1].
s_mov_b32 s16, 0xa5a50001
s_store_dword s16, s[0:1], 0x40 glc
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//store and load s8 times
s_mov_b32 s30, s8 // save the loop count so the load loop can reuse it
s_mov_b32 s31, 0x0 // scalar offset, advanced by 4 per iteration
STORE_LOOP:
// Store v0 (still tid_x at this point) at index v9, scalar offset s31.
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
s_add_u32 s31, s31, 0x4
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP // repeat while s8 != 0
// Restore the loop count and rewind the offset for the read-back pass.
s_mov_b32 s8, s30
s_mov_b32 s31, 0x0
LOAD_LOOP:
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
v_mov_b32 v12, v0 // copy the loaded value; v12 is not read again in this shader
s_add_u32 s31, s31, 0x4
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP // repeat while s8 != 0
// Exit marker: store 0xa5a50000 at offset 0x40 from the address in s[0:1].
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end
+71
Просмотреть файл
@@ -0,0 +1,71 @@
shader main
// Store-then-load loop test with bit 12 of the first resource dword cleared
// (per the comment below, to steer traffic to ea0 on vega20). Performs s8 indexed
// stores followed by s8 indexed loads, the scalar offset stepping by 4 each time.
type(CS)
user_sgpr_count(9) // s[0:1] resource address + s2-s6 thread/thread group parameters + s7 output offset + s8 loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
//For vega20, we need to set bit 12 low to steer traffic to ea0
s_mov_b32 s32, 0xFFFFEFFF // mask with only bit 12 cleared
s_and_b32 s24, s24, s32
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//store and load s8 times
s_mov_b32 s30, s8 // save the loop count so the load loop can reuse it
s_mov_b32 s31, 0x0 // scalar offset, advanced by 4 per iteration
STORE_LOOP:
// Store v0 (still tid_x at this point) at index v9, scalar offset s31.
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
s_add_u32 s31, s31, 0x4
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP // repeat while s8 != 0
// Restore the loop count and rewind the offset for the read-back pass.
s_mov_b32 s8, s30
s_mov_b32 s31, 0x0
LOAD_LOOP:
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
v_mov_b32 v12, v0 // copy the loaded value; v12 is not read again in this shader
s_add_u32 s31, s31, 0x4
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP // repeat while s8 != 0
s_endpgm
end
+345
Просмотреть файл
@@ -0,0 +1,345 @@
shader main
// Buffer traffic generator split across EA0/EA1 by thread-group parity.
//   1. Clear bit 12 of s24; thread groups whose s9 has bit 0 clear then set it again.
//   2. Issue 40 dwordx4 stores, scalar offset starting at 0x6000 and stepping 0x6000.
//   3. After the barrier, thread groups with s9 < 2 run ATOMIC_LOOP, all others run
//      LOAD_LOOP; either loop body repeats s8 times.
// The repeated store/load/atomic sequences are written as assembly-time for-loops;
// they expand to the same fully unrolled instruction stream.
type(CS)
user_sgpr_count(9) // s[0:1] resource address + s2-s6 thread/thread group parameters + s7 output offset + s8 loop count
// User SGPR layout:
//   s[0:1] memory address of the buffer resource
//   s2 x, s3 x*y, s4 x*y*z   (thread-group dimensions)
//   s5 X, s6 X*Y             (grid dimensions)
//   s7 output offset
//   s8 loop count
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
// Thread ids arrive in v0 (tid_x), v1 (tid_y), v2 (tid_z).

// Fetch the buffer resource through SQC.
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0

// Bit 12 low selects EA0.
s_mov_b32 s32, 0xFFFFEFFF
s_and_b32 s24, s24, s32
// Thread groups with bit 0 of s9 set keep EA0 and skip ahead.
s_and_b32 s31, s9, 0x1
s_cmpk_eq_i32 s31, 0x1
s_cbranch_scc1 ODD_WAVES
// Remaining thread groups: bit 12 high selects EA1.
s_mov_b32 s32, 0x1000
s_or_b32 s24, s24, s32
ODD_WAVES:

// v3 = thread id within the group = (tid_z*x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
// s28 = thread-group id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
// v9 = absolute thread id = thread-group id * (x*y*z) + v3
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
// Scale both ids by 0x10 so they can serve as offen byte offsets.
v_mul_i32_i24 v3, v3, 0x10
v_mul_i32_i24 v9, v9, 0x10

// 40 dwordx4 stores at offset v9, scalar offset 0x6000, 0xC000, ... (step 0x6000).
s_mov_b32 s31, 0x6000
for var i = 0; i < 40; i++
    buffer_store_dwordx4 v0, v9, s24, s31 offen:1
    s_add_i32 s31, s31, 0x6000
end
s_barrier

// Starting scalar offsets: s31 for LOAD_LOOP, s32 for ATOMIC_LOOP.
// Neither is reset inside its loop, so later passes continue where the previous one stopped.
s_mov_b32 s31, 0xF0000
s_mov_b32 s32, 0x6000
// s30 counts the s8 passes of whichever loop this thread group runs.
s_mov_b32 s30, s8
// Thread groups with s9 < 2 go straight to the atomic loop.
s_cmpk_lt_i32 s9, 0x2
s_cbranch_scc1 ATOMIC_LOOP

LOAD_LOOP:
// One pass = 40 dwordx4 loads, scalar offset stepping 0x4000, cycling through the
// ten destination registers v4, v8, ..., v40 four times.
for var i = 0; i < 4; i++
    buffer_load_dwordx4 v4, v3, s24, s31 offen:1
    s_add_i32 s31, s31, 0x4000
    buffer_load_dwordx4 v8, v3, s24, s31 offen:1
    s_add_i32 s31, s31, 0x4000
    buffer_load_dwordx4 v12, v3, s24, s31 offen:1
    s_add_i32 s31, s31, 0x4000
    buffer_load_dwordx4 v16, v3, s24, s31 offen:1
    s_add_i32 s31, s31, 0x4000
    buffer_load_dwordx4 v20, v3, s24, s31 offen:1
    s_add_i32 s31, s31, 0x4000
    buffer_load_dwordx4 v24, v3, s24, s31 offen:1
    s_add_i32 s31, s31, 0x4000
    buffer_load_dwordx4 v28, v3, s24, s31 offen:1
    s_add_i32 s31, s31, 0x4000
    buffer_load_dwordx4 v32, v3, s24, s31 offen:1
    s_add_i32 s31, s31, 0x4000
    buffer_load_dwordx4 v36, v3, s24, s31 offen:1
    s_add_i32 s31, s31, 0x4000
    buffer_load_dwordx4 v40, v3, s24, s31 offen:1
    s_add_i32 s31, s31, 0x4000
end
s_sub_u32 s30, s30, 1
s_cmpk_eq_u32 s30, 0
s_cbranch_scc0 LOAD_LOOP
// Thread groups that ran LOAD_LOOP have s9 >= 2 and skip the atomic loop.
s_cmpk_ge_i32 s9, 0x2
s_cbranch_scc1 END

ATOMIC_LOOP:
// One pass = 40 returning (glc:1) 64-bit atomic adds, scalar offset stepping 0x2000.
for var i = 0; i < 40; i++
    buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
    s_add_i32 s32, s32, 0x2000
end
s_sub_u32 s30, s30, 1
s_cmpk_eq_u32 s30, 0
s_cbranch_scc0 ATOMIC_LOOP
//s_waitcnt 0
END:
// Drain all outstanding memory operations before ending the program.
s_waitcnt 0
s_endpgm
end
+509
Просмотреть файл
@@ -0,0 +1,509 @@
shader main
// Buffer/atomic traffic generator split across EA0/EA1 by thread-group parity.
//   1. Clear bit 12 of s24; thread groups whose s9 has bit 0 clear then set it again.
//   2. Issue 40 dwordx4 stores, scalar offset starting at 0x9000 and stepping 0x6000.
//   3. After the barrier, thread groups with s9 < 2 run ATOMIC_LOOP (buffer atomics),
//      all others run LOAD_LOOP (scalar atomics); either loop body repeats s8 times.
// The repeated store and buffer-atomic sequences are written as assembly-time
// for-loops; they expand to the same fully unrolled instruction stream.
type(CS)
user_sgpr_count(9) // s[0:1] resource address + s2-s6 thread/thread group parameters + s7 output offset + s8 loop count
// User SGPR layout:
//   s[0:1] memory address of the buffer resource
//   s2 x, s3 x*y, s4 x*y*z   (thread-group dimensions)
//   s5 X, s6 X*Y             (grid dimensions)
//   s7 output offset
//   s8 loop count
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
// Thread ids arrive in v0 (tid_x), v1 (tid_y), v2 (tid_z).

// Fetch the buffer resource through SQC.
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0

// Bit 12 low selects EA0.
s_mov_b32 s32, 0xFFFFEFFF
s_and_b32 s24, s24, s32
// Thread groups with bit 0 of s9 set keep EA0 and skip ahead.
s_and_b32 s31, s9, 0x1
s_cmpk_eq_i32 s31, 0x1
s_cbranch_scc1 ODD_WAVES
// Remaining thread groups: bit 12 high selects EA1.
s_mov_b32 s32, 0x1000
s_or_b32 s24, s24, s32
ODD_WAVES:

// v3 = thread id within the group = (tid_z*x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
// s28 = thread-group id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
// v9 = absolute thread id = thread-group id * (x*y*z) + v3
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
// Scale both ids by 0x10 so they can serve as offen byte offsets.
v_mul_i32_i24 v3, v3, 0x10
v_mul_i32_i24 v9, v9, 0x10

// 40 dwordx4 stores at offset v9, scalar offset 0x9000, 0xF000, ... (step 0x6000).
s_mov_b32 s31, 0x9000
for var i = 0; i < 40; i++
    buffer_store_dwordx4 v0, v9, s24, s31 offen:1
    s_add_i32 s31, s31, 0x6000
end
s_barrier

// s32 is the starting scalar offset for ATOMIC_LOOP; it is not reset inside the loop.
// s31 is set here but not read again in this shader.
s_mov_b32 s31, 0xF0000
s_mov_b32 s32, 0x9000
// s30 counts the s8 passes of whichever loop this thread group runs.
s_mov_b32 s30, s8
// Thread groups with s9 < 2 go straight to the buffer-atomic loop.
s_cmpk_lt_i32 s9, 0x2
s_cbranch_scc1 ATOMIC_LOOP

// Operand added by every scalar atomic below.
s_mov_b32 s20, 0x1
LOAD_LOOP:
// Despite the label name, one pass issues 148 scalar atomic adds and no loads.
// Offsets 0x100000-0x100390: four groups of ten, listed 0x..00 to 0x..90 in steps of 0x10.
s_atomic_add s20, s0, 0x100000
s_atomic_add s20, s0, 0x100010
s_atomic_add s20, s0, 0x100020
s_atomic_add s20, s0, 0x100030
s_atomic_add s20, s0, 0x100040
s_atomic_add s20, s0, 0x100050
s_atomic_add s20, s0, 0x100060
s_atomic_add s20, s0, 0x100070
s_atomic_add s20, s0, 0x100080
s_atomic_add s20, s0, 0x100090
s_atomic_add s20, s0, 0x100100
s_atomic_add s20, s0, 0x100110
s_atomic_add s20, s0, 0x100120
s_atomic_add s20, s0, 0x100130
s_atomic_add s20, s0, 0x100140
s_atomic_add s20, s0, 0x100150
s_atomic_add s20, s0, 0x100160
s_atomic_add s20, s0, 0x100170
s_atomic_add s20, s0, 0x100180
s_atomic_add s20, s0, 0x100190
s_atomic_add s20, s0, 0x100200
s_atomic_add s20, s0, 0x100210
s_atomic_add s20, s0, 0x100220
s_atomic_add s20, s0, 0x100230
s_atomic_add s20, s0, 0x100240
s_atomic_add s20, s0, 0x100250
s_atomic_add s20, s0, 0x100260
s_atomic_add s20, s0, 0x100270
s_atomic_add s20, s0, 0x100280
s_atomic_add s20, s0, 0x100290
s_atomic_add s20, s0, 0x100300
s_atomic_add s20, s0, 0x100310
s_atomic_add s20, s0, 0x100320
s_atomic_add s20, s0, 0x100330
s_atomic_add s20, s0, 0x100340
s_atomic_add s20, s0, 0x100350
s_atomic_add s20, s0, 0x100360
s_atomic_add s20, s0, 0x100370
s_atomic_add s20, s0, 0x100380
s_atomic_add s20, s0, 0x100390
// Offsets 0x100400-0x100c2c: nine groups of twelve consecutive dwords (step 0x4).
s_atomic_add s20, s0, 0x100400
s_atomic_add s20, s0, 0x100404
s_atomic_add s20, s0, 0x100408
s_atomic_add s20, s0, 0x10040c
s_atomic_add s20, s0, 0x100410
s_atomic_add s20, s0, 0x100414
s_atomic_add s20, s0, 0x100418
s_atomic_add s20, s0, 0x10041c
s_atomic_add s20, s0, 0x100420
s_atomic_add s20, s0, 0x100424
s_atomic_add s20, s0, 0x100428
s_atomic_add s20, s0, 0x10042c
s_atomic_add s20, s0, 0x100500
s_atomic_add s20, s0, 0x100504
s_atomic_add s20, s0, 0x100508
s_atomic_add s20, s0, 0x10050c
s_atomic_add s20, s0, 0x100510
s_atomic_add s20, s0, 0x100514
s_atomic_add s20, s0, 0x100518
s_atomic_add s20, s0, 0x10051c
s_atomic_add s20, s0, 0x100520
s_atomic_add s20, s0, 0x100524
s_atomic_add s20, s0, 0x100528
s_atomic_add s20, s0, 0x10052c
s_atomic_add s20, s0, 0x100600
s_atomic_add s20, s0, 0x100604
s_atomic_add s20, s0, 0x100608
s_atomic_add s20, s0, 0x10060c
s_atomic_add s20, s0, 0x100610
s_atomic_add s20, s0, 0x100614
s_atomic_add s20, s0, 0x100618
s_atomic_add s20, s0, 0x10061c
s_atomic_add s20, s0, 0x100620
s_atomic_add s20, s0, 0x100624
s_atomic_add s20, s0, 0x100628
s_atomic_add s20, s0, 0x10062c
s_atomic_add s20, s0, 0x100700
s_atomic_add s20, s0, 0x100704
s_atomic_add s20, s0, 0x100708
s_atomic_add s20, s0, 0x10070c
s_atomic_add s20, s0, 0x100710
s_atomic_add s20, s0, 0x100714
s_atomic_add s20, s0, 0x100718
s_atomic_add s20, s0, 0x10071c
s_atomic_add s20, s0, 0x100720
s_atomic_add s20, s0, 0x100724
s_atomic_add s20, s0, 0x100728
s_atomic_add s20, s0, 0x10072c
s_atomic_add s20, s0, 0x100800
s_atomic_add s20, s0, 0x100804
s_atomic_add s20, s0, 0x100808
s_atomic_add s20, s0, 0x10080c
s_atomic_add s20, s0, 0x100810
s_atomic_add s20, s0, 0x100814
s_atomic_add s20, s0, 0x100818
s_atomic_add s20, s0, 0x10081c
s_atomic_add s20, s0, 0x100820
s_atomic_add s20, s0, 0x100824
s_atomic_add s20, s0, 0x100828
s_atomic_add s20, s0, 0x10082c
s_atomic_add s20, s0, 0x100900
s_atomic_add s20, s0, 0x100904
s_atomic_add s20, s0, 0x100908
s_atomic_add s20, s0, 0x10090c
s_atomic_add s20, s0, 0x100910
s_atomic_add s20, s0, 0x100914
s_atomic_add s20, s0, 0x100918
s_atomic_add s20, s0, 0x10091c
s_atomic_add s20, s0, 0x100920
s_atomic_add s20, s0, 0x100924
s_atomic_add s20, s0, 0x100928
s_atomic_add s20, s0, 0x10092c
s_atomic_add s20, s0, 0x100a00
s_atomic_add s20, s0, 0x100a04
s_atomic_add s20, s0, 0x100a08
s_atomic_add s20, s0, 0x100a0c
s_atomic_add s20, s0, 0x100a10
s_atomic_add s20, s0, 0x100a14
s_atomic_add s20, s0, 0x100a18
s_atomic_add s20, s0, 0x100a1c
s_atomic_add s20, s0, 0x100a20
s_atomic_add s20, s0, 0x100a24
s_atomic_add s20, s0, 0x100a28
s_atomic_add s20, s0, 0x100a2c
s_atomic_add s20, s0, 0x100b00
s_atomic_add s20, s0, 0x100b04
s_atomic_add s20, s0, 0x100b08
s_atomic_add s20, s0, 0x100b0c
s_atomic_add s20, s0, 0x100b10
s_atomic_add s20, s0, 0x100b14
s_atomic_add s20, s0, 0x100b18
s_atomic_add s20, s0, 0x100b1c
s_atomic_add s20, s0, 0x100b20
s_atomic_add s20, s0, 0x100b24
s_atomic_add s20, s0, 0x100b28
s_atomic_add s20, s0, 0x100b2c
s_atomic_add s20, s0, 0x100c00
s_atomic_add s20, s0, 0x100c04
s_atomic_add s20, s0, 0x100c08
s_atomic_add s20, s0, 0x100c0c
s_atomic_add s20, s0, 0x100c10
s_atomic_add s20, s0, 0x100c14
s_atomic_add s20, s0, 0x100c18
s_atomic_add s20, s0, 0x100c1c
s_atomic_add s20, s0, 0x100c20
s_atomic_add s20, s0, 0x100c24
s_atomic_add s20, s0, 0x100c28
s_atomic_add s20, s0, 0x100c2c
s_sub_u32 s30, s30, 1
s_cmpk_eq_u32 s30, 0
s_cbranch_scc0 LOAD_LOOP
// Thread groups that ran LOAD_LOOP have s9 >= 2 and skip the buffer-atomic loop.
s_cmpk_ge_i32 s9, 0x2
s_cbranch_scc1 END

ATOMIC_LOOP:
// One pass = 80 returning (glc:1) 64-bit atomic adds, scalar offset stepping 0x2000.
for var i = 0; i < 80; i++
    buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
    s_add_i32 s32, s32, 0x2000
end
s_sub_u32 s30, s30, 1
s_cmpk_eq_u32 s30, 0
s_cbranch_scc0 ATOMIC_LOOP
//s_waitcnt 0
END:
// Drain all outstanding memory operations before ending the program.
s_waitcnt 0
s_endpgm
end
Разница между файлами не показана из-за своего большого размера Загрузить разницу
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+80
Просмотреть файл
@@ -0,0 +1,80 @@
// Compute shader: every thread stores a dword at 33 scalar offsets spaced 4KB apart,
// using its absolute thread id as the buffer index. The read-back loop at the bottom is
// switched off here (DEBUG_FUNCTION = 0).
shader main
type(CS)
user_sgpr_count(9) // s[0:1] for the buffer resource pointer + s2..s8 for thread/thread group parameters
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop (the incoming value is overwritten with the constant 33 below)
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
// two 4-dword descriptors are read: s[24:27] from byte offset 0 and s[20:23] from byte offset 16
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
// wait for the scalar loads before the descriptors are used
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
// bits OR-ed into the last dword of the s[24:27] descriptor; the name suggests an
// uncached memory type -- TODO confirm against the descriptor layout
var MTYPE_UC = 0x38000000
// NOTE(review): s27 belongs to s[24:27], but the buffer operations below use the
// descriptor at s20 -- confirm whether this OR was meant for s23.
s_or_b32 s27, s27, MTYPE_UC
//store and load s8 times
s_mov_b32 s8, 33 // store 33 times to overflow atcl1 cache...
// s30 keeps the iteration count so the (optional) load loop can restore s8
s_mov_b32 s30, s8
// s31 is the scalar offset passed to the buffer store; starts at 0
s_mov_b32 s31, 0x0
STORE_LOOP:
// the stored value changes on every iteration: v0 += 2
v_add_co_u32 v0, vcc[0:1], v0, 2
buffer_store_dword v0, v9, s20, s31 idxen:1 glc:1 slc:1
s_waitcnt 0
s_add_u32 s31, s31, 4*1024 // step one 4KB page size
// count down; SCC is set when s8 reaches 0, so the branch is taken while s8 != 0
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
// assemble-time switch for the read-back loop below; 0 leaves it out
// (presumably the if/end block is then not assembled -- TODO confirm sp3 semantics)
var DEBUG_FUNCTION = 0
//remove code to halve shader run time
if DEBUG_FUNCTION
// restart the counter and the offset, then read back what was stored
s_mov_b32 s8, s30
s_mov_b32 s31, 0x0
LOAD_LOOP:
buffer_load_dword v0, v9, s20, s31 idxen:1 glc:1 slc:1
s_waitcnt 0
// copy the loaded value to v12; v12 is not read again in this shader
v_mov_b32 v12, v0
s_add_u32 s31, s31, 4*1024
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
end
s_endpgm
end
+80
Просмотреть файл
@@ -0,0 +1,80 @@
// Compute shader: every thread stores a dword at 33 scalar offsets spaced 4KB apart,
// using its absolute thread id as the buffer index, then (DEBUG_FUNCTION = 1) reads
// two dwords back at 32 offsets starting from 0xffc.
shader main
type(CS)
user_sgpr_count(9) // s[0:1] for the buffer resource pointer + s2..s8 for thread/thread group parameters
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop (the incoming value is overwritten with the constant 33 below)
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
// two 4-dword descriptors are read: s[24:27] from byte offset 0 and s[20:23] from byte offset 16
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
// wait for the scalar loads before the descriptors are used
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
// bits OR-ed into the last dword of the s[24:27] descriptor; the name suggests an
// uncached memory type -- TODO confirm against the descriptor layout
var MTYPE_UC = 0x38000000
// NOTE(review): s27 belongs to s[24:27], but the buffer operations below use the
// descriptor at s20 -- confirm whether this OR was meant for s23.
s_or_b32 s27, s27, MTYPE_UC
//store and load s8 times
s_mov_b32 s8, 33 // store 33 times to overflow atcl1 cache...
// s30 receives a copy of the count; it is not read again in this shader
s_mov_b32 s30, s8
// s31 is the scalar offset passed to the buffer store; starts at 0
s_mov_b32 s31, 0x0
STORE_LOOP:
// the stored value changes on every iteration: v0 += 2
v_add_co_u32 v0, vcc[0:1], v0, 2
buffer_store_dword v0, v9, s20, s31 idxen:1 glc:1 slc:1
s_waitcnt 0
s_add_u32 s31, s31, 4*1024 // step one 4KB page size
// count down; SCC is set when s8 reaches 0, so the branch is taken while s8 != 0
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
// assemble-time switch for the read-back loop below; 1 keeps it in
var DEBUG_FUNCTION = 1
//set DEBUG_FUNCTION to 0 to remove this code and halve shader run time
if DEBUG_FUNCTION
// 0x20 = 32 read iterations
s_mov_b32 s8, 0x20
// start 4 bytes below 0x1000; with an 8-byte load this presumably makes each access
// straddle a 4KB boundary -- TODO confirm intent
s_mov_b32 s31, 0xffc
LOAD_LOOP:
// two-dword load: overwrites both v0 and v1
buffer_load_dwordx2 v[0:1], v9, s20, s31 idxen:1 glc:1 slc:1
s_waitcnt 0
// copy the first loaded dword to v12; v12 is not read again in this shader
v_mov_b32 v12, v0
s_add_u32 s31, s31, 4*1024
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
end
s_endpgm
end
+72
Просмотреть файл
@@ -0,0 +1,72 @@
// Compute shader: each thread scales its absolute thread id by 65536 and uses it as the
// buffer index for s8 dword stores followed by s8 dword loads, with the descriptor's
// third dword forced to 0xffffffff (see the "Hack number of records" note below).
shader main
type(CS)
user_sgpr_count(9) // s[0:1] for the buffer resource pointer + s2..s8 for thread/thread group parameters
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
// wait for the descriptor load before it is used or patched
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//bump up the addresses being accessed to generate multiple reads to the pde memories
// (u24 multiply: presumably only the low 24 bits of v9 take part -- TODO confirm)
v_mul_u32_u24 v9, 65536, v9
//store and load s8 times
// NOTE(review): s8 is decremented before it is tested, so an incoming s8 of 0 would
// wrap around instead of skipping the loop -- callers presumably always pass >= 1.
// s30 keeps the iteration count so the load loop can restore s8
s_mov_b32 s30, s8
s_mov_b32 s31, 0x0
//Hack number of records to avoid range checking which we don't want since we want to generate
//out of range accesses. we are really trying to generate many reads to the PDEs to get FUE.
s_mov_b32 s26, 0xffffffff
STORE_LOOP:
// store v0 at index v9, scalar offset s31
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
// advance the scalar offset by one dword
s_add_u32 s31, s31, 0x4
// count down; the branch is taken while s8 != 0
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
// restart the counter and the offset for the read pass
s_mov_b32 s8, s30
s_mov_b32 s31, 0x0
LOAD_LOOP:
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
// copy the loaded value to v12; v12 is not read again in this shader
v_mov_b32 v12, v0
s_add_u32 s31, s31, 0x4
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
s_endpgm
end
+72
Просмотреть файл
@@ -0,0 +1,72 @@
// Compute shader: each thread scales its absolute thread id by 4096 and uses it as the
// buffer index for s8 dword stores followed by s8 dword loads, with the descriptor's
// third dword forced to 0xffffffff (see the "Hack number of records" note below).
shader main
type(CS)
user_sgpr_count(9) // s[0:1] for the buffer resource pointer + s2..s8 for thread/thread group parameters
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
// wait for the descriptor load before it is used or patched
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//bump up the addresses being accessed to generate multiple reads to the pde memories
// (u24 multiply: presumably only the low 24 bits of v9 take part -- TODO confirm)
v_mul_u32_u24 v9, 4096, v9
//store and load s8 times
// NOTE(review): s8 is decremented before it is tested, so an incoming s8 of 0 would
// wrap around instead of skipping the loop -- callers presumably always pass >= 1.
// s30 keeps the iteration count so the load loop can restore s8
s_mov_b32 s30, s8
s_mov_b32 s31, 0x0
//Hack number of records to avoid range checking which we don't want since we want to generate
//out of range accesses. we are really trying to generate many reads to the PDEs to get FUE.
s_mov_b32 s26, 0xffffffff
STORE_LOOP:
// store v0 at index v9, scalar offset s31
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
// advance the scalar offset by one dword
s_add_u32 s31, s31, 0x4
// count down; the branch is taken while s8 != 0
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
// restart the counter and the offset for the read pass
s_mov_b32 s8, s30
s_mov_b32 s31, 0x0
LOAD_LOOP:
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
// copy the loaded value to v12; v12 is not read again in this shader
v_mov_b32 v12, v0
s_add_u32 s31, s31, 0x4
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
s_endpgm
end
+47
Просмотреть файл
@@ -0,0 +1,47 @@
// Compute shader: each thread performs a single dword buffer load, indexed by its
// absolute thread id, and then exits. The loaded value is not used further.
shader main
type(CS)
user_sgpr_count(8) // s[0:1] for the buffer resource pointer + s2..s7 for thread/thread group parameters
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
// wait for the descriptor load before it is used
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
// scalar offset 0; the load is indexed by v9 only
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
// wait for the load to return before ending the program
s_waitcnt 0
s_endpgm
end
+54
Просмотреть файл
@@ -0,0 +1,54 @@
// Compute shader: each thread stores v0 (tid_x) s8 times, indexed by its absolute
// thread id, advancing the scalar offset by one dword per iteration.
shader main
type(CS)
user_sgpr_count(9) // s[0:1] for the buffer resource pointer + s2..s8 for thread/thread group parameters
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
// wait for the descriptor load before it is used
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//store s8 times (this shader has no load loop)
// NOTE(review): s8 is decremented before it is tested, so an incoming s8 of 0 would
// wrap around instead of skipping the loop -- callers presumably always pass >= 1.
// s30 receives a copy of the count; it is not read again in this shader
s_mov_b32 s30, s8
s_mov_b32 s31, 0x0
STORE_LOOP:
// store v0 at index v9, scalar offset s31
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
// advance the scalar offset by one dword
s_add_u32 s31, s31, 0x4
// count down; the branch is taken while s8 != 0
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
s_endpgm
end
+54
Просмотреть файл
@@ -0,0 +1,54 @@
// Compute shader: copies the incoming thread-id VGPRs into v10..v19, copies those into
// v29..v20, then writes the user SGPR s2 to byte offset 0 of the address in s[0:1].
shader main
type(CS)
user_sgpr_count(9) // s[0:1] for the output address + s2..s8 for thread/thread group parameters
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
// keep a copy of s2; it is the value written out at the end
s_mov_b32 s16, s2
//SPI may touch v0,v1,v2 before shader is run
//store it 10 times
// v10..v18 alternate between v1 and v2; v19 takes v0
v_mov_b32 v10, v1
v_mov_b32 v11, v2
v_mov_b32 v12, v1
v_mov_b32 v13, v2
v_mov_b32 v14, v1
v_mov_b32 v15, v2
v_mov_b32 v16, v1
v_mov_b32 v17, v2
v_mov_b32 v18, v1
v_mov_b32 v19, v0
// read them back
// v10..v19 are copied into v29..v20 (reverse order); the results are not used further
v_mov_b32 v29, v10
v_mov_b32 v28, v11
v_mov_b32 v27, v12
v_mov_b32 v26, v13
v_mov_b32 v25, v14
v_mov_b32 v24, v15
v_mov_b32 v23, v16
v_mov_b32 v22, v17
v_mov_b32 v21, v18
v_mov_b32 v20, v19
// NOTE(review): no s_waitcnt follows this scalar store before s_endpgm -- confirm the
// write is guaranteed to complete.
s_store_dword s16, s[0:1], 0x0 glc
s_endpgm
end
+75
Просмотреть файл
@@ -0,0 +1,75 @@
// Compute shader: touches every ACC VGPR and VGPR, writes VGPR contents to LDS, then
// stores an id derived from HW_REG_HW_ID (SE * 64 + CU * 4 + SIMD) into the output
// buffer at byte offset id * 4.
shader main
type(CS)
user_sgpr_count(2) // s[0:1] only: the output buffer address
//s[0:1] the memory address for the buffer resource
tgid_x_en(1) //s_tgid_x s2
tgid_y_en(1) //s_tgid_y s3
tgid_z_en(1) //s_tgid_z s4
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
// assemble-time loop: one v_accvgpr_read per register, acc[0..255] -> v[0..255]
for var vgpr = 0; vgpr < 256; ++vgpr
v_accvgpr_read v[vgpr], acc[vgpr]
end
// and back again: v[0..255] -> acc[0..255]
for var vgpr = 0; vgpr < 256; ++vgpr
v_accvgpr_write acc[vgpr], v[vgpr]
end
s_movk_i32 m0, 0x0000
// s10 = 0xf8 (248) is the starting GPR index; it steps down by 8 per iteration
s_mov_b32 s10, 0x000000f8
// enable GPR indexing; mode 0x8 presumably selects destination-relative indexing, so
// the v0..v7 writes below land on v[idx]..v[idx+7] -- TODO confirm against the ISA doc
s_set_gpr_idx_on s10, 0x8
label_0004:
v_mov_b32 v0, 0
v_mov_b32 v1, 0
v_mov_b32 v2, 0
v_mov_b32 v3, 0
v_mov_b32 v4, 0
v_mov_b32 v5, 0
v_mov_b32 v6, 0
v_mov_b32 v7, 0
// s_sub_u32 sets SCC on unsigned borrow, so the loop ends once s10 steps below 0
s_sub_u32 s10, s10, 8
s_set_gpr_idx_idx s10
s_cbranch_scc0 label_0004
s_set_gpr_idx_off
// v1 = count of set exec bits below this lane (presumably the lane id) -- TODO confirm
// NOTE(review): the lo step masks with exec_hi and the hi step with exec_lo; confirm
// this pairing is intended.
v_mbcnt_lo_u32_b32 v1, exec_hi, 0
v_mbcnt_hi_u32_b32 v1, exec_lo, v1
// 8 bytes per lane
v_mul_u32_u24 v1, 8, v1
// read the 2-bit field at bit 4 of HW_ID (the SIMD id, per the decoding further down)
s_getreg_b32 s11, hwreg(HW_REG_HW_ID, 4, 2)
// each SIMD gets its own 0x4000-byte region
s_mulk_i32 s11, 0x4000
v_add_co_u32 v1, vcc, v1, s11
// s10 = 7: the loop below runs until the decrement borrows (8 passes)
s_mov_b32 s10, 7
s_mov_b32 m0, -1
label_001B:
// write v[2:3] and v[4:5] (zeroed above, if the indexed clear behaves as assumed) to
// four LDS locations relative to v1
ds_write2_b64 v1, v[2:3], v[2:3] offset1:64
ds_write2_b64 v1, v[4:5], v[4:5] offset0:128 offset1:192
// advance the LDS address by 0x800 bytes per pass
v_add_co_u32 v1, vcc, 0x00000800, v1
s_sub_u32 s10, s10, 1
s_cbranch_scc0 label_001B
// read the full 32-bit HW_ID register and decode its fields
s_getreg_b32 s20, hwreg(HW_REG_HW_ID, 0, 32)
// s12 = SIMD
s_lshr_b32 s12,s20,4
s_and_b32 s12, s12, 0x3
// s13 = CU
s_lshr_b32 s13,s20,8
s_and_b32 s13, s13, 0xf
// s14 = SE
s_lshr_b32 s14,s20,13
s_and_b32 s14, s14, 0x7
// s15 = SE * 16 * 4 + CU * 4 + SIMD
s_mul_i32 s16, s14, 64
s_mul_i32 s17, s13, 4
s_add_i32 s15, s16, s17
s_add_i32 s15, s15, s12
// byte offset of this SIMD's dword slot in the output buffer
s_mul_i32 s16, s15, 4
s_store_dword s15, s[0:1], s16 glc
s_waitcnt 0
s_endpgm
end
+58
Просмотреть файл
@@ -0,0 +1,58 @@
// Compute shader: spins for s7 iterations, then each thread stores its absolute thread
// id into the buffer (indexed by that id, scalar offset s8), and finally the scalar
// value 0xa5a50000 is written at byte offset 0x40 from the address in s[0:1].
//s[0:1]: buffer resource
//s2: num_threads_x_full
//s3: num_threads_x_full * num_threads_y_full
//s4: num_threads_x_full * num_threads_y_full * num_threads_z_full
//s5: COMPUTE_DIM_X
//s6: COMPUTE_DIM_X * COMPUTE_DIM_Y
//s7: loop_lifetime
//s8: dispatch_offset
//s[9:11]: thread group ID
//v[0:2]: thread ID
shader main
type(CS)
user_sgpr_count(9)
tgid_x_en(1)
tgid_y_en(1)
tgid_z_en(1)
//sp3 loop for lifetime
// the loop does no useful work besides running s7 times; a signed compare is used,
// so s7 <= 0 skips the body entirely
s_mov_b32 s12, 0 //init loop idx s12
label_0004:
s_cmp_lt_i32 s12, s7 //scc = (s12 < s7) ? 1 : 0
s_cbranch_scc0 label_0006 //if(scc == 0) then jump to label_0006; else nop
v_mov_b32 v4,s12
s_add_i32 s12, s12, 1 //add loop incr
s_branch label_0004
label_0006: //end of SP3 loop
//v3 thread_id_in_group = (tid_z * num_threads_x_full * num_threads_y_full) + (tid_y * num_threads_x_full) + tid_x
v_mad_u32_u24 v3, v1, s2, v0 //v3 = tid_y * num_threads_x_full + tid_x
v_mad_u32_u24 v3, v2, s3, v3 //v3 = tid_z * num_threads_x_full * num_threads_y_full + v3
//s28 thread_group_id = (tgid_z * COMPUTE_DIM_X * COMPUTE_DIM_Y) + (tgid_y * COMPUTE_DIM_X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5 //tgid_y * COMPUTE_DIM_X
s_add_i32 s28, s28, s_tgid_x //tgid_y * COMPUTE_DIM_X + tgid_x
s_mul_i32 s29, s6, s_tgid_z //tgid_z * COMPUTE_DIM_X * COMPUTE_DIM_Y
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id * (num_threads_x_full * num_threads_y_full * num_threads_z_full) + thread_id_in_group
v_mov_b32 v9, s28 //thread_group_id
v_mad_u32_u24 v9, v9, s4, v3
//fetch the buffer resource
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
//write absolute thread id using it as an index
buffer_store_dword v9, v9, s24, s8 idxen:1
s_waitcnt 0
// marker value written once the buffer store has completed
// NOTE(review): no s_waitcnt follows this scalar store before s_endpgm -- confirm the
// write is guaranteed to complete.
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Просмотреть файл
Просмотреть файл
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше