Create prebuilt raslib package for RDC

Create a folder for the prebuilt raslib, which contains the RAS binary
and its configuration files. CMakeLists.txt is changed to install
those files.

Change-Id: I530198cff5686a19e58096c87457ab8b7c52d5f3


[ROCm/rdc commit: 3aa95b210a]
This commit is contained in:
Bill(Shuzhou) Liu
2021-03-01 15:46:24 -05:00
parent 114470e450
commit a3398b4751
169 ha cambiato i file con 25577 aggiunte e 0 eliminazioni
+11
Vedi File
@@ -236,6 +236,17 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/example
DESTINATION ${RDC_CLIENT_INSTALL_PREFIX}/${RDC}
COMPONENT ${CLIENT_COMPONENT})
# Prebuilt RAS artifacts shipped with the client component.
# The shared object is copied as a single file.
install(
    FILES ${PROJECT_SOURCE_DIR}/ras_prebuild/librdc_ras.so
    DESTINATION ${RDC_CLIENT_INSTALL_PREFIX}/${RDC}/lib
    COMPONENT ${CLIENT_COMPONENT}
)
# The source paths have no trailing slash, so each directory is installed
# as a sub-directory (lib/config, lib/sp3) of the destination.
install(
    DIRECTORY ${PROJECT_SOURCE_DIR}/ras_prebuild/config
    DESTINATION ${RDC_CLIENT_INSTALL_PREFIX}/${RDC}/lib
    COMPONENT ${CLIENT_COMPONENT}
)
install(
    DIRECTORY ${PROJECT_SOURCE_DIR}/ras_prebuild/sp3
    DESTINATION ${RDC_CLIENT_INSTALL_PREFIX}/${RDC}/lib
    COMPONENT ${CLIENT_COMPONENT}
)
set(CPACK_PACKAGE_NAME ${RDC_PACKAGE})
set(CPACK_PACKAGE_VERSION ${VERSION_STRING})
File diff soppresso perché troppo grande Carica Diff
@@ -0,0 +1,26 @@
{
"version": "0.0.1",
"devices": [
{
"name": "VEGA20",
"ids": [ "0x66A0", "0x66A1", "0x66A2", "0x66A3", "0x66A4", "0x66A7", "0x66AF" ],
"config": "vega20.json",
"gfx": "libgfx9.so",
"sdma": "libsdma4.so"
},
{
"name": "ARCTURUS",
"ids": [ "0x738C", "0x7388", "0x738E" ],
"config": "arcturus.json",
"gfx": "libgfx9.so",
"sdma": "libsdma4.so"
},
{
"name": "SIENNA_CICHLID",
"ids": [ "0x73A0", "0x73A2", "0x73A3", "0x73AB", "0x73AE", "0x73BF" ],
"config": "sienna_cichlid.json",
"gfx": "libgfx10.so",
"sdma": "libsdma5.so"
}
]
}
@@ -0,0 +1,34 @@
{
"version": "0.0.1",
"type": {
"parity": 1,
"single_correctable": 2,
"multi_uncorrectable": 4,
"poison": 8
},
"block": {
"umc": {
"index": 0,
"support": 1,
"type": [
"single_correctable",
"multi_uncorrectable",
"poison"
]
}
},
"tests": [
{
"name": "ras_umc.0.2",
"block": "umc",
"type": "single_correctable",
"nullDispatchCS": "sp3/gfx10/edc/bin/sienna_cichlid/gc_edc_sqc_inst_bank_snop.bin"
},
{
"name": "ras_umc.0.4",
"block": "umc",
"type": "multi_uncorrectable",
"nullDispatchCS": "sp3/gfx10/edc/bin/sienna_cichlid/gc_edc_sqc_inst_bank_snop.bin"
}
]
}
File diff soppresso perché troppo grande Carica Diff
File binario non mostrato.
@@ -0,0 +1,31 @@
shader main
// Compute shader whose body is only 1000 s_nop instructions followed by s_endpgm.
// The user SGPRs, thread-group ids and thread-id VGPRs described below are
// declared for the dispatch but are never read by this shader.
asic(GFX10)
wave_size(32)
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 7 for thread/thread group parameters
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
// Emit s_nop 1000 times (the for/end pair is presumably expanded by the assembler -- confirm)
for var i = 0; i < 1000; i++
s_nop 0x1
end
s_endpgm
end
@@ -0,0 +1,42 @@
shader main
// Compute shader that writes zero to the ACC VGPRs, to the VGPRs (through GPR
// indexing) and then to LDS (through ds_write2_b64). It takes no user SGPRs.
type(CS)
user_sgpr_count(0)
// Clear ACC VGPR
// One v_accvgpr_write per register index 0..255
for var vgpr = 0; vgpr < 256; ++vgpr
v_accvgpr_write acc[vgpr], 0
end
// Clear VGPRs: GPR index starts at 0xf8 and is lowered by 8 per pass,
// each pass writes zero to eight consecutive registers.
// Mode 0x8 presumably selects destination-relative indexing -- confirm against the ISA guide.
s_movk_i32 m0, 0x0000
s_mov_b32 s10, 0x000000f8
s_set_gpr_idx_on s10, 0x8
label_0004:
v_mov_b32 v0, 0
v_mov_b32 v1, 0
v_mov_b32 v2, 0
v_mov_b32 v3, 0
v_mov_b32 v4, 0
v_mov_b32 v5, 0
v_mov_b32 v6, 0
v_mov_b32 v7, 0
s_sub_u32 s10, s10, 8
s_set_gpr_idx_idx s10
// Loop again while SCC is 0 after the subtraction
s_cbranch_scc0 label_0004
s_set_gpr_idx_off
// v1 = mbcnt result * 8 (presumably lane id * 8 bytes -- confirm)
v_mbcnt_lo_u32_b32 v1, exec_hi, 0
v_mbcnt_hi_u32_b32 v1, exec_lo, v1
v_mul_u32_u24 v1, 8, v1
// s11 = 2-bit field at bit offset 4 of HW_ID (presumably the SIMD id -- confirm), scaled by 0x4000
s_getreg_b32 s11, hwreg(HW_REG_HW_ID, 4, 2)
s_mulk_i32 s11, 0x4000
// v1 becomes the LDS address used by the writes below
v_add_co_u32 v1, vcc, v1, s11
// Clear LDS: counter starts at 7; every pass issues two ds_write2_b64 of the
// (already zeroed) v[2:3] / v[4:5] and advances the address by 0x800.
s_mov_b32 s10, 7
s_mov_b32 m0, -1
label_001B:
ds_write2_b64 v1, v[2:3], v[2:3] offset1:64
ds_write2_b64 v1, v[4:5], v[4:5] offset0:128 offset1:192
v_add_co_u32 v1, vcc, 0x00000800, v1
s_sub_u32 s10, s10, 1
// Loop again while SCC is 0 after the subtraction
s_cbranch_scc0 label_001B
s_endpgm
end
@@ -0,0 +1,113 @@
shader main
// Compute shader that, in order: clears the ACC VGPRs, spins in a loop bounded
// by s8, loads buffer descriptors through s[0:1], computes thread ids, zeroes
// VGPRs and LDS, and finally stores a SE/CU/SIMD-derived value into the buffer
// described by s[24:27].
type(CS)
// NOTE(review): the comments below name nine user SGPRs (s0..s8) and the loop at
// label_0001 reads s8, but the declared count is 8 -- confirm which is intended.
user_sgpr_count(8) // 2 for the buffer resource address + thread/thread group parameters
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
// Clear ACC VGPR
// One v_accvgpr_write per register index 0..255
for var vgpr = 0; vgpr < 256; ++vgpr
v_accvgpr_write acc[vgpr], 0
end
//sp3 loop for lifetime
// Counts s12 from 0 up to s8, copying the counter into v4 on every pass
s_mov_b32 s12, 0 //init loop idx s12
label_0001:
s_cmp_lt_i32 s12, s8 //scc = (s12 < s8) ? 1 : 0
s_cbranch_scc0 label_0006 //if(scc == 0) then jump to label_0006; else nop
v_mov_b32 v4,s12
s_add_i32 s12, s12, 1 //add loop incr
s_branch label_0001
label_0006: //end of SP3 loop
//fetch the buffer resource through SQC
// Four dwords from offset 0x0 into s[24:27] and four from offset 0x20 into s[40:43];
// s[40:43] is not referenced again in this shader.
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
s_load_dwordx4 s[40:43], s[0:1], 0x20
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
// Clear VGPR and LDS
// GPR index starts at 0xf8 and is lowered by 8 per pass; each pass writes zero
// to eight consecutive registers. Mode 0x8 presumably selects
// destination-relative indexing -- confirm against the ISA guide.
s_movk_i32 m0, 0x0000
s_mov_b32 s12, 0x000000f8
s_set_gpr_idx_on s12, 0x8
label_0004:
v_mov_b32 v0, 0
v_mov_b32 v1, 0
v_mov_b32 v2, 0
v_mov_b32 v3, 0
v_mov_b32 v4, 0
v_mov_b32 v5, 0
v_mov_b32 v6, 0
v_mov_b32 v7, 0
s_sub_u32 s12, s12, 8
s_set_gpr_idx_idx s12
// Loop again while SCC is 0 after the subtraction
s_cbranch_scc0 label_0004
s_set_gpr_idx_off
// v1 = mbcnt result * 8 (presumably lane id * 8 bytes -- confirm)
v_mbcnt_lo_u32_b32 v1, exec_hi, 0
v_mbcnt_hi_u32_b32 v1, exec_lo, v1
v_mul_u32_u24 v1, 8, v1
// s13 = HW_ID bits [5:4] (the same field extracted as SIMD further down), scaled by 0x4000
s_getreg_b32 s13, hwreg(HW_REG_HW_ID, 4, 2)
s_mulk_i32 s13, 0x4000
// v1 becomes the LDS address used by the writes below
v_add_co_u32 v1, vcc, v1, s13
// LDS clear: counter starts at 7; every pass issues two ds_write2_b64 of the
// (already zeroed) v[2:3] / v[4:5] and advances the address by 0x800.
s_mov_b32 s12, 7
s_mov_b32 m0, -1
label_001B:
ds_write2_b64 v1, v[2:3], v[2:3] offset1:64
ds_write2_b64 v1, v[4:5], v[4:5] offset0:128 offset1:192
v_add_co_u32 v1, vcc, 0x00000800, v1
s_sub_u32 s12, s12, 1
// Loop again while SCC is 0 after the subtraction
s_cbranch_scc0 label_001B
// Save coverage in the memory
// Read the whole 32-bit HW_ID register, then slice the fields out of it
s_getreg_b32 s20, hwreg(HW_REG_HW_ID, 0, 32)
// s12 = SIMD
s_lshr_b32 s12,s20,4
s_and_b32 s12, s12, 0x3
// s13 = CU
s_lshr_b32 s13,s20,8
s_and_b32 s13, s13, 0xf
// s14 = SE
s_lshr_b32 s14,s20,13
s_and_b32 s14, s14, 0x7
// s15 = SE * 16 * 4 + CU * 4 + SIMD
s_mul_i32 s16, s14, 64
s_mul_i32 s17, s13, 4
s_add_i32 s15, s16, s17
s_add_i32 s15, s15, s12
// s16 = s15 * 4, used as the offset of the store; the value stored is s15 itself
s_mul_i32 s16, s15, 4
s_buffer_store_dword s15, s24, s16 glc
s_waitcnt 0
// Read the same location back into s17
s_buffer_load_dword s17, s24, s16 glc
s_waitcnt 0
s_endpgm
end
@@ -0,0 +1,59 @@
shader main
// Compute shader that loads one dword per thread from the buffer described by
// s[24:27], indexed by the absolute thread id in v9, and then stores the marker
// 0xa5a50000 at offset 0x40 from the address in s[0:1].
type(CS)
user_sgpr_count(8) // 2 for the buffer resource address + 6 for thread/thread group parameters
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
// s31 = 0 is passed as the last operand of the load; v9 is the index (idxen:1)
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
// Copy the loaded value into v10
v_mov_b32 v10, v0
//buffer_load_dword v10, v9, s24, s31 idxen:1 glc:1
//s_waitcnt 0
//v_mov_b32 v11, v1
s_nop 0x1
s_nop 0x1
s_nop 0x1
s_nop 0x1
// Write the marker value 0xa5a50000 at offset 0x40 from the address in s[0:1]
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end
@@ -0,0 +1,60 @@
shader main
// Compute shader that reads one dword per thread with ds_read_b32 gds:1, at
// byte offset thread_id_in_group * 4, and copies the result into v12.
// v9 (absolute thread id) is computed but not read afterwards in this shader.
type(CS)
user_sgpr_count(8) // 2 for the buffer resource address + 6 for thread/thread group parameters
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read from the GDS
// v10 = v3 << 2, i.e. thread_id_in_group * 4 used as the read address
v_lshlrev_b32 v10, 2, v3
// m0 = 0xFFFF before the DS access (presumably the size/bound for the access -- confirm)
s_mov_b32 m0, 0xFFFF
s_nop 0x1
s_nop 0x1
s_nop 0x1
ds_read_b32 v11, v10 gds:1
s_waitcnt 0
// Copy the value that was read into v12
v_mov_b32 v12, v11
s_nop 0x1
s_nop 0x1
s_nop 0x1
s_nop 0x1
s_endpgm
end
@@ -0,0 +1,673 @@
shader main
type(CS)
/*************************************************************************/
/* control on how to run the shader */
/*************************************************************************/
//any hack that needs to be made to run this code in EMU (either because various EMU code is not ready or there is no compute save & restore in EMU run)
var EMU_RUN_HACK = 1
var EMU_RUN_HACK_RESTORE_NORMAL = 0
var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
var SAVE_LDS = 0
var WG_BASE_ADDR_LO = 0x9000a000
var WG_BASE_ADDR_HI = 0x0
var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
var CTX_SAVE_CONTROL = 0x0
var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run)
var SGPR_SAVE_USE_SQC = 0 //use SQC D$ to do the write
var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC)
var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
/**************************************************************************/
/* variables */
/**************************************************************************/
var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
/* Save */
var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
var S_SAVE_SPI_INIT_ATC_SHIFT = 27
var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi
//tba_lo and tba_hi need to be saved/restored
var tba_lo = ttmp12
var tba_hi = ttmp13
var tma_lo = ttmp14
var tma_hi = ttmp15
var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
var s_save_status = ttmp4
var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
var s_save_xnack_mask_lo = ttmp6
var s_save_xnack_mask_hi = ttmp7
var s_save_buf_rsrc0 = ttmp8
var s_save_buf_rsrc1 = ttmp9
var s_save_buf_rsrc2 = ttmp10
var s_save_buf_rsrc3 = ttmp11
var s_save_mem_offset = tma_lo
var s_save_alloc_size = s_save_trapsts //conflict
var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
var s_save_m0 = tma_hi
/* Restore */
var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
var s_restore_spi_init_lo = exec_lo
var s_restore_spi_init_hi = exec_hi
var s_restore_mem_offset = ttmp2
var s_restore_alloc_size = ttmp3
var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored
var s_restore_mem_offset_save = s_restore_tmp //no conflict
var s_restore_m0 = s_restore_alloc_size //no conflict
var s_restore_mode = ttmp7
var s_restore_pc_lo = ttmp0
var s_restore_pc_hi = ttmp1
var s_restore_exec_lo = tma_lo //no conflict
var s_restore_exec_hi = tma_hi //no conflict
var s_restore_status = ttmp4
var s_restore_trapsts = ttmp5
var s_restore_xnack_mask_lo = xnack_mask_lo
var s_restore_xnack_mask_hi = xnack_mask_hi
var s_restore_buf_rsrc0 = ttmp8
var s_restore_buf_rsrc1 = ttmp9
var s_restore_buf_rsrc2 = ttmp10
var s_restore_buf_rsrc3 = ttmp11
/**************************************************************************/
/* trap handler entry points */
/**************************************************************************/
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
//FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
//FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
else
s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
end
L_JUMP_TO_RESTORE:
s_branch L_RESTORE //restore
L_SKIP_RESTORE:
s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
s_cbranch_scc1 L_SAVE //this is the operation for save
//the potential code (such as restore STATUS) on this path is for regular trap handling and is irrelevant to compute save & restore
//EMU will not execute the code since in hack mode it is skipped while in normal mode there is no save in EMU
//SIM will only execute the code in normal S/R mode but not in hack mode
if (!EMU_RUN_HACK)
L_ERROR: //to catch incorrect savectx setting in SIM assuming the trap handler is only used for save & restore
s_branch L_ERROR
end
/**************************************************************************/
/* save routine */
/**************************************************************************/
L_SAVE:
//check whether there is mem_viol
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
s_cbranch_scc0 L_NO_PC_REWIND
//if so, need rewind PC assuming GDS operation gets NACKed
s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
L_NO_PC_REWIND:
s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
/* inform SPI the readiness and wait for SPI's go signal */
s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
s_mov_b32 s_save_exec_hi, exec_hi
s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
if (EMU_RUN_HACK)
else
s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
end
L_SLEEP:
s_sleep 0x2
if (EMU_RUN_HACK)
else
s_cbranch_execz L_SLEEP
end
/* setup Resource Constants */
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
//calculate wd_addr using absolute thread id
v_readlane_b32 s_save_tmp, v9, 0
s_lshr_b32 s_save_tmp, s_save_tmp, 6
s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
else
end
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
else
end
s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
//FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
s_mov_b32 s_save_m0, m0 //save M0
/* global mem offset */
s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
/* the first wave in the threadgroup */
s_barrier //FIXME not performance-optimal "LDS is used? wait for other waves in the same TG"
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
s_cbranch_scc0 L_SAVE_VGPR
/* save LDS */
//////////////////////////////
L_SAVE_LDS:
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
s_mov_b32 exec_hi, 0xFFFFFFFF
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
s_cbranch_scc0 L_SAVE_VGPR //no lds used? jump to L_SAVE_VGPR
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 m0, 0x0 //lds_offset initial value = 0
L_SAVE_LDS_LOOP:
if (SAVE_LDS)
buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1
end
s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //mem offset increased by 256 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
/* save VGPRs */
//////////////////////////////
L_SAVE_VGPR:
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
s_mov_b32 exec_hi, 0xFFFFFFFF
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 m0, 0x0 //VGPR initial index value =0
s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later
L_SAVE_VGPR_LOOP:
v_mov_b32 v0, v0 //v0 = v[0+m0]
if(USE_MTBUF_INSTEAD_OF_MUBUF)
tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
end
s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //every buffer_store_dword does 256 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
s_set_gpr_idx_off
/* save SGPRs */
//////////////////////////////
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
if (SGPR_SAVE_USE_SQC)
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
else
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
end
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 m0, 0x0 //SGPR initial index value =0
s_nop 0x0 //Manually inserted wait states
L_SAVE_SGPR_LOOP:
s_movrels_b32 s0, s0 //s0 = s[0+m0]
write_sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4
s_add_u32 m0, m0, 1 //next sgpr index
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
/* save HW registers */
//////////////////////////////
L_SAVE_HWREG:
s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
write_sgpr_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO
s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI
end
write_sgpr_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC
write_sgpr_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
write_sgpr_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC
write_sgpr_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
write_sgpr_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS
//s_save_trapsts conflicts with s_save_alloc_size
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
write_sgpr_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS
write_sgpr_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO
write_sgpr_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_HI
//use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
write_sgpr_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
write_sgpr_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TBA_LO
write_sgpr_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TBA_HI
/* S_PGM_END_SAVED */ //FIXME graphics ONLY
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
s_rfe_b64 s_save_pc_lo //Return to the main shader program
else
end
s_branch L_END_PGM
/**************************************************************************/
/* restore routine */
/**************************************************************************/
L_RESTORE:
/* Setup Resource Constants */
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
//calculate wd_addr using absolute thread id
v_readlane_b32 s_restore_tmp, v9, 0
s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
else
end
s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
/* global mem offset */
s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
/* the first wave in the threadgroup */
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
s_cbranch_scc0 L_RESTORE_VGPR
/* restore LDS */
//////////////////////////////
L_RESTORE_LDS:
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
s_mov_b32 exec_hi, 0xFFFFFFFF
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 m0, 0x0 //lds_offset initial value = 0
L_RESTORE_LDS_LOOP:
if (SAVE_LDS)
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
end
s_add_u32 m0, m0, 256 //every buffer_load_dword does 256 bytes
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?
/* restore VGPRs */
//////////////////////////////
L_RESTORE_VGPR:
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
s_mov_b32 exec_hi, 0xFFFFFFFF
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256
s_mov_b32 m0, 1 //VGPR initial index value = 1
s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later
L_RESTORE_VGPR_LOOP:
if(USE_MTBUF_INSTEAD_OF_MUBUF)
tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
end
s_waitcnt vmcnt(0) //ensure data ready
v_mov_b32 v0, v0 //v[0+m0] = v0
s_add_u32 m0, m0, 1 //next vgpr index
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //every buffer_load_dword does 256 bytes
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete?
s_set_gpr_idx_off
/* VGPR restore on v0 */
if(USE_MTBUF_INSTEAD_OF_MUBUF)
tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
else
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
end
/* restore SGPRs */
//////////////////////////////
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
if (SGPR_SAVE_USE_SQC)
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
else
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
end
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
read_sgpr_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp
s_mov_b32 m0, 0x1 //SGPR initial index value =1 //go on with with s1
L_RESTORE_SGPR_LOOP:
read_sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made
s_waitcnt lgkmcnt(0) //ensure data ready
s_movreld_b32 s0, s0 //s[0+m0] = s0
s_add_u32 m0, m0, 1 //next sgpr index
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete?
s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */
/* restore HW registers */
//////////////////////////////
L_RESTORE_HWREG:
s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
if (SWIZZLE_EN)
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
else
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
end
read_sgpr_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0
read_sgpr_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC
read_sgpr_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
read_sgpr_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC
read_sgpr_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
read_sgpr_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS
read_sgpr_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS
read_sgpr_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO
read_sgpr_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI
read_sgpr_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE
read_sgpr_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TBA_LO
read_sgpr_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TBA_HI
s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
//for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
end
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal
s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
end
s_mov_b32 m0, s_restore_m0
s_mov_b32 exec_lo, s_restore_exec_lo
s_mov_b32 exec_hi, s_restore_exec_hi
s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
//s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
//reuse s_restore_m0 as a temp register
s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status
s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time
// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
/**************************************************************************/
/* the END */
/**************************************************************************/
L_END_PGM:
s_endpgm
end
/**************************************************************************/
/* the helper functions */
/**************************************************************************/
function write_sgpr_to_mem(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
    // Write the scalar value `s` to the buffer described by s_rsrc at
    // s_mem_offset, then advance s_mem_offset to the next slot.
    //   use_sqc   : use a scalar buffer store; slots are 4 bytes apart.
    //   otherwise : stage the value in v0 and use a vector store; slots are
    //               256 bytes apart. use_mtbuf picks the typed (tbuffer)
    //               store over the untyped (buffer) one.
    if (use_sqc)
        // the scalar store takes its offset from m0, so m0 is parked in exec_lo meanwhile
        s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
        s_mov_b32 m0, s_mem_offset
        s_buffer_store_dword s, s_rsrc, m0 glc:1
        s_add_u32 s_mem_offset, s_mem_offset, 4
        s_mov_b32 m0, exec_lo
    else
        // both vector paths share the v0 staging and the 256-byte step
        v_mov_b32 v0, s
        if (use_mtbuf)
            tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
        else
            buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
        end
        s_add_u32 s_mem_offset, s_mem_offset, 256
    end
end
function read_sgpr_from_mem(s, s_rsrc, s_mem_offset, use_sqc)
    // Issue a scalar buffer load of one dword into `s` from s_rsrc at
    // s_mem_offset, then advance s_mem_offset to the next slot.
    // No wait is issued here; the caller must s_waitcnt before using `s`.
    s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
    // slot spacing: 256 bytes unless the scalar (SQC) layout is selected, then 4 bytes
    if (!use_sqc)
        s_add_u32 s_mem_offset, s_mem_offset, 256
    else
        s_add_u32 s_mem_offset, s_mem_offset, 4
    end
end
@@ -0,0 +1,21 @@
shader main
// Counting-loop compute shader.
// Loads a limit into v4 through the address held in v[0:1] (copied from
// s0/s1), increments v5 while v5 < v4, then stores the final v5 through the
// address held in v2/v3 (copied from s2/s3).
type(CS)
user_sgpr_count(4)
// copy the four user SGPRs into VGPRs so they can serve as flat addresses
v_mov_b32 v0, s0
v_mov_b32 v1, s1
v_mov_b32 v2, s2
v_mov_b32 v3, s3
// v4 = loop limit read from memory
flat_load_dword v4, v[0:1] slc
s_waitcnt vmcnt(0)&lgkmcnt(0)
// v5 = iteration counter, starts at 0
v_mov_b32 v5, 0
s_sleep 40000
LOOP:
v_add_co_u32 v5, vcc, 1, v5
s_waitcnt vmcnt(0)&lgkmcnt(0)
// vcc = (v5 < v4); branch back while vcc is non-zero
v_cmp_lt_u32 vcc, v5, v4
s_cbranch_vccnz LOOP
// NOTE(review): the load above spells its address pair v[0:1] while this store spells it v[2,3] - confirm the assembler accepts both forms
flat_store_dword v[2,3], v5
s_waitcnt vmcnt(0)&lgkmcnt(0)
s_endpgm
end
@@ -0,0 +1,69 @@
shader main
// Buffer store/load loop shader.
// Fetches a buffer resource through s[0:1], computes an absolute thread id in
// v9, then runs s8 iterations of two indexed stores of v0 followed by s8
// iterations of two indexed loads from the same scalar offsets.
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//store and load s8 times
// s30 keeps the original loop count so it can be restored for the load phase
s_mov_b32 s30, s8
// s31 is the scalar offset passed to each buffer access; it starts at 0x80
s_mov_b32 s31, 0x80
STORE_LOOP:
// two stores per iteration, the scalar offset advancing by 0x100 after each
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x100
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x100
// the counter is decremented before it is tested, so the body always runs at least once
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
// rewind the loop count and the scalar offset so the loads revisit the stored locations
s_mov_b32 s8, s30
s_mov_b32 s31, 0x80
s_waitcnt 0
LOAD_LOOP:
// two loads per iteration, mirroring the store loop
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x100
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x100
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
s_endpgm
end
@@ -0,0 +1,131 @@
shader main
// Unrolled buffer store/load shader.
// Fetches a buffer resource through s[0:1], computes an absolute thread id in
// v9, issues 20 indexed stores of v0 at scalar offsets 0x0, 0x4, ... 0x4C and
// then 20 indexed loads that read the same offsets back.
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//store and load 20 times each (unrolled; the s8 loop count is not used to iterate here)
// s30 receives a copy of s8 but nothing below reads it
s_mov_b32 s30, s8
// s31 is the scalar offset passed to each buffer access
s_mov_b32 s31, 0x0
// ---- store phase: 20 stores, scalar offset stepping by 4 bytes ----
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
s_waitcnt 0
// ---- load phase: rewind the scalar offset so the loads read back the
// locations written above (without this the loads would start at 0x50,
// past everything the store phase touched) ----
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_add_u32 s31, s31, 0x4
s_endpgm
end
@@ -0,0 +1,61 @@
shader main
// Offset-addressed buffer store loop.
// Fetches a buffer resource through s[0:1], turns the absolute thread id into
// a per-thread offset (id * 0x4000), then stores v0 s8 times while the scalar
// offset s31 advances by 0x10000 per iteration. Finally writes 0xa5a50000 at
// offset 0x40 from the address in s[0:1].
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
// scale the thread id by 0x4000 to form the per-thread offset used with offen
s_mov_b32 s32, 0x4000
v_mul_i32_i24 v9, v9, s32
//store s8 times (this shader has no load phase)
// s30 receives a copy of s8 but nothing below reads it
s_mov_b32 s30, s8
s_mov_b32 s31, 0x10000
STORE_LOOP:
buffer_store_dword v0, v9, s24, s31 offen:1
s_waitcnt 0
s_add_u32 s31, s31, 0x10000
// the counter is decremented before it is tested, so the body always runs at least once
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
// write 0xa5a50000 at offset 0x40 from s[0:1] - presumably a completion marker for the host; TODO confirm
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end
@@ -0,0 +1,79 @@
shader main
// GDS write/read loop shader.
// Loads one dword per thread from the buffer fetched through s[0:1], writes
// it to GDS s8 times at addresses stepping by 0x10, reads GDS back s8 times,
// then writes 0xa5a50000 at offset 0x40 from the address in s[0:1].
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
//write it to GDS
// s30 keeps the original loop count so it can be restored for the read loop
s_mov_b32 s30, s8
// NOTE(review): the write address is v9 << 2 (absolute thread id) while the read-back address below is v3 << 2 (thread id in group); they match only when thread_group_id * s4 is 0 - confirm this is intended
v_lshlrev_b32 v10, 2, v9
s_mov_b32 m0, 0xFFFF
s_nop 0x1
s_nop 0x1
s_nop 0x1
STORE_LOOP:
ds_write_b32 v10, v0 gds:1 // GPU hang when GPU access the GDS with GFX queue
s_waitcnt 0
v_add_u16 v10, v10, 0x10
// the counter is decremented before it is tested, so the body always runs at least once
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
// restore the loop count and recompute the start address for the read loop
s_mov_b32 s8, s30
v_lshlrev_b32 v10, 2, v3
LOAD_LOOP:
ds_read_b32 v11, v10 gds:1
s_waitcnt 0
// consume the value just read by copying it to v12
v_mov_b32 v12, v11
v_add_u16 v10, v10, 0x10
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
// write 0xa5a50000 at offset 0x40 from s[0:1] - presumably a completion marker for the host; TODO confirm
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end
@@ -0,0 +1,55 @@
shader main
// GDS read-out shader.
// Each thread reads one dword from GDS at (thread_id_in_group << 2) and
// stores it to the buffer fetched through s[0:1], indexed by the absolute
// thread id with s7 as the scalar offset.
type(CS)
user_sgpr_count(8) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read data from GDS
// v10 = thread_id_in_group * 4 (byte address of this thread's dword)
v_lshlrev_b32 v10, 2, v3
s_mov_b32 m0, 0xFFFF
s_nop 1
s_nop 1
s_nop 1
ds_read_b32 v11, v10 gds:1
s_waitcnt 0
//write the data to memory
buffer_store_dword v11, v9, s24, s7 idxen:1 glc:1
s_waitcnt 0
s_endpgm
end
@@ -0,0 +1,68 @@
shader main
// GDS ordered-count shader.
// Reads HW_ID to get the pipe id, issues ds_ordered_count, and then writes v0
// to GDS at dword address
//   0x80 + 0x200 * pipe_id + 0x40 * ordered_count_result + lane_id
// (shifted left by 2 to form the byte address).
type(CS)
user_sgpr_count(4)
tgid_x_en(1)
tgid_y_en(1)
tgid_z_en(1)
s_getreg_b32 s18, hwreg(HW_REG_HW_ID, 0, 32)
// s16 is extracted but not read again (the add below is commented out)
s_bfe_u32 s16, s18, 0x2001e // get meid
s_bfe_u32 s17, s18, 0x20006 // get pipeid
//s_add_u32 s17, s17, s16
// get ring id
v_mov_b32 v20, s17
s_and_b32 s17, s17, 0x7
// Get thread_id inside wave
v_mbcnt_lo_u32_b32 v8, 0xffffffff, 0
v_mbcnt_hi_u32_b32 v9, 0xffffffff, v8
s_waitcnt 0
// init: gds write address
v_mov_b32 v13, 0
// the first 128DW is for ordered-append counter
v_mov_b32 v14, 0x80
// offset ring
v_mov_b32 v15, 0x200
v_mul_lo_u32 v15, v15, v20 // ring offset
v_mov_b32 v16, 0x40 // wave_size
// v18 = v1 * s1 + v0; v17 = v18 >> 6. Neither value is read before v18 is overwritten below.
v_mul_lo_u32 v18, v1, s1
v_add_co_u32 v18, vcc, v18, v0
v_lshrrev_b32 v17,6 ,v18
// build m0 for ds_ordered_count: ((s12 >> 6) & 0x7ff) | ((pipe_id & 0x7) << 18)
// NOTE(review): s12 is not written anywhere in this shader - presumably preloaded at launch; verify
s_mov_b32 s9, s12
s_lshr_b32 s9, s9, 6
s_and_b32 s9, s9, 0x7ff
s_lshl_b32 s17, s17, 18
s_or_b32 s9, s9, s17
s_mov_b32 m0, s9
v_mov_b32 v10, 1
v_mov_b32 v11, 0
ds_ordered_count v11, v10 gds:1 offset0:0 offset1:1
s_waitcnt 0
// v18 = value returned by ds_ordered_count
v_mov_b32 v18, v11
v_mul_lo_u32 v16, v16, v18 // waves offset before.
// accumulate the dword address: counter area + ring offset + wave offset + lane id
v_add_co_u32 v13, vcc, v13, v14
v_add_co_u32 v13, vcc, v13, v15
v_add_co_u32 v13, vcc, v13, v16
v_add_co_u32 v13, vcc, v13, v9
// dword index -> byte address
v_lshlrev_b32 v13,2,v13
s_mov_b32 m0, 0x4000
s_nop 0
ds_write_b32 v13, v0 gds:1
s_waitcnt 0
s_endpgm
end
@@ -0,0 +1,79 @@
shader main
// LDS write/read loop shader.
// Loads one dword per thread from the buffer fetched through s[0:1], writes
// it to LDS s8 times at addresses stepping by 0x10 from
// (thread_id_in_group << 2), reads the same addresses back s8 times, then
// writes 0xa5a50000 at offset 0x40 from the address in s[0:1].
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
//store and load s8 times
// s30 keeps the original loop count so it can be restored for the read loop
s_mov_b32 s30, s8
// v10 = thread_id_in_group * 4 (start address for this thread)
v_lshlrev_b32 v10, 2, v3
s_mov_b32 m0, 0xFFFF
s_nop 0x1
s_nop 0x1
s_nop 0x1
STORE_LOOP:
ds_write_b32 v10, v0
s_waitcnt 0
v_add_u16 v10, v10, 0x10
// the counter is decremented before it is tested, so the body always runs at least once
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
// restore the loop count and start address so the reads revisit the written locations
s_mov_b32 s8, s30
v_lshlrev_b32 v10, 2, v3
LOAD_LOOP:
ds_read_b32 v11, v10
s_waitcnt 0
// consume the value just read by copying it to v12
v_mov_b32 v12, v11
v_add_u16 v10, v10, 0x10
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
// write 0xa5a50000 at offset 0x40 from s[0:1] - presumably a completion marker for the host; TODO confirm
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end
@@ -0,0 +1,55 @@
shader main
// LDS read-out shader.
// Each thread reads one dword from LDS at (thread_id_in_group << 2) and
// stores it to the buffer fetched through s[0:1], indexed by the absolute
// thread id with s7 as the scalar offset.
type(CS)
user_sgpr_count(8) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read it from LDS
// v10 = thread_id_in_group * 4 (byte address of this thread's dword)
v_lshlrev_b32 v10, 2, v3
s_mov_b32 m0, 0xFFFF
s_nop 1
s_nop 1
s_nop 1
ds_read_b32 v0, v10
s_waitcnt 0
//write the data to memory
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
s_waitcnt 0
s_endpgm
end
@@ -0,0 +1,52 @@
shader main
// Load-and-store-back shader.
// Each thread loads one dword from the buffer fetched through s[0:1] at
// scalar offset 0 and stores it back to the same buffer at scalar offset s7,
// both indexed by the absolute thread id.
type(CS)
user_sgpr_count(8) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
//export poisoned data to L2
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
s_waitcnt 0
s_endpgm
end
@@ -0,0 +1,77 @@
shader main
// VGPR copy-chain shader.
// Each thread loads one dword from the buffer fetched through s[0:1], copies
// it into v10..v19, copies those registers on into v29..v20, and finally
// stores the originally loaded v0 back to the buffer at scalar offset s7.
type(CS)
user_sgpr_count(8) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
s_mov_b32 s31, 0x0
//For vega20, we need to set bit 12 low. This bit will just be set low here in the shader.
//s_mov_b32 s24, 0x15c000
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
//store it 10 times (copy the loaded value into v10..v19)
v_mov_b32 v10, v0
v_mov_b32 v11, v0
v_mov_b32 v12, v0
v_mov_b32 v13, v0
v_mov_b32 v14, v0
v_mov_b32 v15, v0
v_mov_b32 v16, v0
v_mov_b32 v17, v0
v_mov_b32 v18, v0
v_mov_b32 v19, v0
// read them back (v10..v19 are copied into v29..v20, in reverse register order)
v_mov_b32 v29, v10
v_mov_b32 v28, v11
v_mov_b32 v27, v12
v_mov_b32 v26, v13
v_mov_b32 v25, v14
v_mov_b32 v24, v15
v_mov_b32 v23, v16
v_mov_b32 v22, v17
v_mov_b32 v21, v18
v_mov_b32 v20, v19
//export poisoned data to L2
// note: the value stored is v0 itself, not one of the copies made above
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
s_waitcnt 0
s_endpgm
end
@@ -0,0 +1,51 @@
shader main
// Load-and-store-back shader.
// Each thread loads one dword from the buffer fetched through s[0:1] at
// scalar offset 0 and stores it back to the same buffer at scalar offset s7,
// both indexed by the absolute thread id.
type(CS)
user_sgpr_count(8) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
//export poisoned data to L2
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
s_waitcnt 0
s_endpgm
end
@@ -0,0 +1,55 @@
shader main
// SGPR copy-chain shader.
// Copies s0..s9 into s30..s39 and back again, then stores the value that s2
// held on entry (saved in s16) at offset 0 from the address in s[0:1].
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//SPI may touch s0...sn before shader is run
// keep the entry value of s2 in s16; it is the value written out at the end
s_mov_b32 s16, s2
//write data
s_mov_b32 s30, s0
s_mov_b32 s31, s1
s_mov_b32 s32, s2
s_mov_b32 s33, s3
s_mov_b32 s34, s4
s_mov_b32 s35, s5
s_mov_b32 s36, s6
s_mov_b32 s37, s7
s_mov_b32 s38, s8
s_mov_b32 s39, s9
//read back
s_mov_b32 s0, s30
s_mov_b32 s1, s31
s_mov_b32 s2, s32
s_mov_b32 s3, s33
s_mov_b32 s4, s34
s_mov_b32 s5, s35
s_mov_b32 s6, s36
s_mov_b32 s7, s37
s_mov_b32 s8, s38
s_mov_b32 s9, s39
// the store address s[0:1] is the pair that just went through the copy chain
s_store_dword s16, s[0:1], 0x0 glc
s_endpgm
end
@@ -0,0 +1,75 @@
shader main
// Scalar store loop with a 4KB stride.
// Fetches two buffer resources through s[0:1] (s[24:27] at offset 0 and
// s[20:23] at offset 16), then issues s8 scalar stores through s[20:23] with
// m0 advancing by 4*1024 per iteration. A read-back loop exists but is
// compiled out because DEBUG_FUNCTION is 0.
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
var MTYPE_UC = 0x38000000
// NOTE(review): MTYPE_UC is ORed into s27 (last dword of s[24:27]), but every access below goes through s[20:23]; confirm which descriptor was meant
s_or_b32 s27, s27, MTYPE_UC
// s30 keeps the original loop count for the optional read-back loop
s_mov_b32 s30, s8
s_mov_b32 m0, 0x0
STORE_LOOP:
// the value stored is the current loop counter s8
s_buffer_store_dword s8, s[20:23], m0 glc:1
s_waitcnt 0
s_add_u32 m0, m0, 4*1024 // step one 4KB page table address
// the counter is decremented before it is tested, so the body always runs at least once
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
var DEBUG_FUNCTION = 0
// Remove function check code to half shader run time...
if DEBUG_FUNCTION
s_mov_b32 s8, s30
s_mov_b32 m0, 0x0
LOAD_LOOP:
s_buffer_load_dword s0, s[20:23], m0 glc:1
s_waitcnt 0
s_add_u32 m0, m0, 4*1024
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
end
s_endpgm
end
@@ -0,0 +1,96 @@
shader main
// Scalar store/load bank-touch shader (glc:1 variant).
// Fetches a buffer resource through s[0:1] and, s8 times, performs a scalar
// store followed by a scalar load at m0 = 0x0 and again at m0 = 0x40. The
// 0x80 and 0xC0 accesses are commented out, so only two of the four banks
// named by TOUCH_4_BANKS are exercised.
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
/*
s_bfe_u32 s33, s8, 0x20004 // extract bank select bits
s_lshl_b32 s33, s33, 6 // ((bank_sel & 0x3) << 6) , bank_sel = address[9:8] ^ address[7:6], if 4 bank enabled
s_and_b32 s8, s8, 0xf
*/
//store and load s8 times
// s30 and s31 are initialised here but nothing below reads them
s_mov_b32 s30, s8
s_mov_b32 s31, 0x0
s_or_b32 s26, s26, 0x1000 //hack the buffer size to enough
STORE_LOOP:
var TOUCH_4_BANKS=1
if TOUCH_4_BANKS
// each bank: store the loop counter s8, wait, load it back into s32, wait
s_mov_b32 m0, 0x0 // BANKA
s_buffer_store_dword s8, s[24:27], m0 glc:1
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:1
s_waitcnt 0
s_mov_b32 m0, 0x40 // BANKB
s_buffer_store_dword s8, s[24:27], m0 glc:1
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:1
s_waitcnt 0
/*
s_mov_b32 m0, 0x80 // BANKC
s_buffer_store_dword s8, s[24:27], m0 glc:1
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:1
s_waitcnt 0
s_mov_b32 m0, 0xC0 // BANKD
s_buffer_store_dword s8, s[24:27], m0 glc:1
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:1
s_waitcnt 0
*/
end
// the counter is decremented before it is tested, so the body always runs at least once
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
s_dcache_wb // to make emu, sim img match...
s_endpgm
end
@@ -0,0 +1,96 @@
shader main
// Scalar store/load bank-touch shader (glc:0 variant).
// Fetches a buffer resource through s[0:1] and, s8 times, performs a scalar
// store followed by a scalar load at m0 = 0x0 and again at m0 = 0x40, all
// with glc:0. The 0x80 and 0xC0 accesses are commented out, so only two of
// the four banks named by TOUCH_4_BANKS are exercised.
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
/*
s_bfe_u32 s33, s8, 0x20004 // extract bank select bits
s_lshl_b32 s33, s33, 6 // ((bank_sel & 0x3) << 6) , bank_sel = address[9:8] ^ address[7:6], if 4 bank enabled
s_and_b32 s8, s8, 0xf
*/
//store and load s8 times
// s30 and s31 are initialised here but nothing below reads them
s_mov_b32 s30, s8
s_mov_b32 s31, 0x0
s_or_b32 s26, s26, 0x1000 //hack the buffer size to enough
STORE_LOOP:
var TOUCH_4_BANKS=1
if TOUCH_4_BANKS
// each bank: store the loop counter s8, wait, load it back into s32, wait
s_mov_b32 m0, 0x0 // BANKA
s_buffer_store_dword s8, s[24:27], m0 glc:0
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:0
s_waitcnt 0
s_mov_b32 m0, 0x40 // BANKB
s_buffer_store_dword s8, s[24:27], m0 glc:0
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:0
s_waitcnt 0
/*
s_mov_b32 m0, 0x80 // BANKC
s_buffer_store_dword s8, s[24:27], m0 glc:1
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:1
s_waitcnt 0
s_mov_b32 m0, 0xC0 // BANKD
s_buffer_store_dword s8, s[24:27], m0 glc:1
s_waitcnt 0
s_buffer_load_dword s32, s[24:27], m0 glc:1
s_waitcnt 0
*/
end
// the counter is decremented before it is tested, so the body always runs at least once
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
s_dcache_wb // to make emu, sim img match...
s_endpgm
end
@@ -0,0 +1,112 @@
shader main
// Scalar-cache dirty-bit sequence shader.
// Fetches a buffer resource through s[0:1] and, s8 times, runs a
// store(glc:1) / store(glc:0) / load(glc:0) sequence at two offset pairs
// (s12/s13 and s14/s15), advancing all four offsets by 0x80 per iteration.
// Ends with s_dcache_wb and a write of 0xa5a50000 at offset 0x40 from the
// address in s[0:1].
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address + 5 thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
s_or_b32 s27, s27, 0x8000000 // changing mtype to non volatile
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//store and load s8 times
// s30 and s31 are initialised here but nothing below reads them
s_mov_b32 s30, s8
s_mov_b32 s31, 0x0
// s9..s11 carried the thread group ids (already consumed above); they are reused as store data
s_mov_b32 s9, 0xaa
s_mov_b32 s10, 0xbb
s_mov_b32 s11, 0xcc
// BUFFER STORE OFFSETS FOR BANK A AND BANKB
s_mov_b32 s12, 0x0
s_mov_b32 s13, 0x10
s_mov_b32 s14, 0x40
s_mov_b32 s15, 0x50
// The following sequence is needed to inject error in dirty bit ram. Sequence was provided by SQC designer 4/1/2015
//1. you have an invalid line in data cache,
//2. you write to some of the dwords in that line (the remaining dwords are still invalid),
//3. then there is a read request that hit on that line, but it needs the dwords that are not yet there in that line
//(in other words, it needs some of the invalid dwords of that line),
//4. the request will go to TC,
//5. when TC return comes back, the dirty bit ram will be read
STORE_LOOP:
var TOUCH_4_BANKS=1
if TOUCH_4_BANKS
// NOTE(review): the trailing comments below say "one dword" but each store is a dwordx2 (s[8:9] = loop counter + 0xaa, s[10:11] = 0xbb + 0xcc) - confirm intent
s_mov_b32 m0, s13 // BANKA write one dword to tc
s_buffer_store_dwordx2 s[8:9], s[24:27], m0 glc:1
s_waitcnt 0
s_mov_b32 m0, s12 // BANKA. write one dword to sqc
s_buffer_store_dwordx2 s[10:11], s[24:27], m0 glc:0
s_waitcnt 0
s_mov_b32 m0, s13 // BANK A read the dword that is not in cache
s_buffer_load_dword s32, s[24:27], m0 glc:0
s_waitcnt 0
s_mov_b32 m0, s15 // BANKB write one dword to tc
s_buffer_store_dwordx2 s[8:9], s[24:27], m0 glc:1
s_waitcnt 0
s_mov_b32 m0, s14 // BANKB write one dword to sqc
s_buffer_store_dwordx2 s[10:11], s[24:27], m0 glc:0
s_waitcnt 0
s_mov_b32 m0, s15 // BANK B read the dword that is not in cache
s_buffer_load_dword s32, s[24:27], m0 glc:0
s_waitcnt 0
end
// advance all four offsets by 0x80 for the next iteration
s_add_u32 s12, s12,0x80
s_add_u32 s13, s13,0x80
s_add_u32 s14, s14,0x80
s_add_u32 s15, s15,0x80
// the counter is decremented before it is tested, so the body always runs at least once
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
s_dcache_wb // to make emu, sim img match...
// write 0xa5a50000 at offset 0x40 from s[0:1] - presumably a completion marker for the host; TODO confirm
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end
@@ -0,0 +1,63 @@
// Compute shader whose body is 34 labelled pages of instructions. Each page
// is one s_cbranch_execnz to the next page's label followed by 1023
// v_mov_b32 (4 KB per page if every instruction is one dword, as the
// original comments state). inst_page[34] labels the final s_endpgm.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource + 5 for thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
// NOTE(review): the two descriptors and the thread ids computed below are
// not consumed by any memory instruction in this shader.
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
// OR the MTYPE_UC bits into the last descriptor dword of s[24:27]
var MTYPE_UC = 0x38000000
s_or_b32 s27, s27, MTYPE_UC
// 35 labels: one per page plus the terminating label
label inst_page[34+1] // 34 4k pages
for var i =0; i < 34; i++
inst_page[i]:
//each block is 4k in size...
// branch to the start of the next page when EXEC is non-zero
s_cbranch_execnz inst_page[i+1] //1 dword
// fill the rest of the page: (4*1024)/4 - 1 = 1023 instructions
for var j = 0; j < (4*1024)/4 -1; j++
v_mov_b32 v0, 0 // each with 1 dword
end
end
inst_page[34]:
s_endpgm
end
@@ -0,0 +1,69 @@
// Compute shader made of a chain of labelled instruction blocks and no
// memory accesses. Blocks 0..14 each hold one s_branch to the next label
// plus 15 v_nop (16 instructions; 64 bytes if each is one dword, as the
// "1DW" comment and the BLOCK_64B name suggest). After a further 15 v_nop,
// the last label is followed by 81 s_nop and s_endpgm.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource + 5 for thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
// don't care about the loop count, fix 8 loops
// Total number of cache lines equals 2 (A, B) * 8
var num_cache_lines = 16
label BLOCK_64B[num_cache_lines]
// blocks 0..14: the s_branch at the head of each block jumps straight to
// the next label, so the v_nop fillers are not on the branch path
for var loop = 0; loop < num_cache_lines - 1; loop++
BLOCK_64B[loop]:
s_branch BLOCK_64B[loop+1] // 1DW
for var i = 0; i < 15; i++
v_nop
end
end
// last block
// NOTE(review): these 15 v_nop are emitted before the BLOCK_64B[15] label,
// i.e. directly after block 14's fillers — confirm this placement is intended.
for var i = 0; i < 15; i++
v_nop
end
//For uei 2 msb and lsb flipped
// s_nop will become v_nop and it will be a legal instruction
BLOCK_64B[num_cache_lines-1]:
for var i = 0; i < 81; i++
s_nop 0x1
end
s_endpgm
end
/** comment, four bank interleave
Addr 0x90000000 => Bank A
Addr 0x90000040 => Bank B
Addr 0x90000080 => Bank C
Addr 0x900000c0 => Bank D
Addr 0x90000100 => Bank B
Addr 0x90000140 => Bank A
Addr 0x90000180 => Bank D
Addr 0x900001c0 => Bank C
Addr 0x90000200 => Bank C
Addr 0x90000240 => Bank D
Addr 0x90000280 => Bank A
Addr 0x900002c0 => Bank B
Addr 0x90000300 => Bank D
Addr 0x90000340 => Bank C
Addr 0x90000380 => Bank B
**/
@@ -0,0 +1,29 @@
// Compute shader that executes 1000 `s_nop 0x1` instructions and then ends.
// It issues no memory instructions and reads none of its input registers.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource + 5 for thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
// emit 1000 copies of s_nop 0x1
for var i = 0; i < 1000; i++
s_nop 0x1
end
s_endpgm
end
@@ -0,0 +1,51 @@
// Compute shader: each thread computes its absolute thread id (v9), loads
// one dword from the buffer at index v9 with scalar offset 0, and stores
// that dword back at index v9 with scalar offset s7.
shader main
type(CS)
user_sgpr_count(8) // 2 for the buffer resource + 5 for thread/thread group parameters + output offset
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read from memory
// NOTE(review): the resource operand is written as s24; presumably sp3
// expands it to the 4-dword descriptor s[24:27] loaded above — confirm.
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
//write the data to memory
// same index (v9), but the scalar offset is s7 (the output offset)
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
s_waitcnt 0
s_endpgm
end
@@ -0,0 +1,73 @@
// Compute shader: writes 0xa5a50001 to offset 0x40 from s[0:1], then each
// thread stores v0 s8 times (scalar offset stepping by 4 from 0), loads the
// same locations back s8 times, and finally 0xa5a50000 is written to the
// same 0x40 location.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource + 5 for thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
// marker written before any work is done
// (presumably an "in progress" flag for the host; see the 0xa5a50000 write at the end — confirm)
s_mov_b32 s16, 0xa5a50001
s_store_dword s16, s[0:1], 0x40 glc
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//store and load s8 times
// s30 keeps a copy of the loop count so s8 can be restored for LOAD_LOOP
s_mov_b32 s30, s8
s_mov_b32 s31, 0x0
// NOTE(review): s8 is decremented before the zero test in both loops, so
// they assume s8 >= 1 on entry — confirm with the caller.
STORE_LOOP:
// store v0 at index v9; the scalar offset s31 advances by 4 per pass
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
s_add_u32 s31, s31, 0x4
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
// restore the loop count and rewind the scalar offset
s_mov_b32 s8, s30
s_mov_b32 s31, 0x0
LOAD_LOOP:
// load back from the same index/offset sequence and copy the result to v12
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
v_mov_b32 v12, v0
s_add_u32 s31, s31, 0x4
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
// marker written after both loops have finished
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end
@@ -0,0 +1,71 @@
// Compute shader: clears bit 12 of s24 (the first descriptor dword), then
// each thread stores v0 s8 times (scalar offset stepping by 4 from 0) and
// loads the same locations back s8 times.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource + 5 for thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
//For vega20, we need to set bit 12 low to steer traffic to ea0
// 0xFFFFEFFF has every bit set except bit 12
s_mov_b32 s32, 0xFFFFEFFF
s_and_b32 s24, s24, s32
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//store and load s8 times
// s30 keeps a copy of the loop count so s8 can be restored for LOAD_LOOP
s_mov_b32 s30, s8
s_mov_b32 s31, 0x0
// NOTE(review): s8 is decremented before the zero test in both loops, so
// they assume s8 >= 1 on entry — confirm with the caller.
STORE_LOOP:
// store v0 at index v9; the scalar offset s31 advances by 4 per pass
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
s_add_u32 s31, s31, 0x4
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP
// restore the loop count and rewind the scalar offset
s_mov_b32 s8, s30
s_mov_b32 s31, 0x0
LOAD_LOOP:
// load back from the same index/offset sequence and copy the result to v12
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
v_mov_b32 v12, v0
s_add_u32 s31, s31, 0x4
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
s_endpgm
end
@@ -0,0 +1,345 @@
// Compute shader in three phases:
//   1. every thread issues 40 buffer_store_dwordx4 with the scalar offset
//      stepping by 0x6000 from 0x6000, then waits at s_barrier;
//   2. waves with s9 >= 2 run LOAD_LOOP (40 buffer_load_dwordx4 per pass,
//      scalar offset stepping by 0x4000 from 0xF0000) and then jump to END;
//   3. waves with s9 < 2 run ATOMIC_LOOP (40 buffer_atomic_add_x2 per pass,
//      scalar offset stepping by 0x2000 from 0x6000).
// Both loops run s8 passes (counter copied to s30). Bit 12 of s24 is cleared
// when s9 is odd and set when s9 is even.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource + 5 for thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
//set bit 12 low to select EA0
s_mov_b32 s32, 0xFFFFEFFF
s_and_b32 s24, s24, s32
// test the low bit of s9 (s_tgid_x per the comment above): when it is 1,
// skip the next two instructions so bit 12 stays cleared
s_and_b32 s31, s9, 0x1
s_cmpk_eq_i32 s31, 0x1
s_cbranch_scc1 ODD_WAVES
//set bit 12 high to select EA1
s_mov_b32 s32, 0x1000
s_or_b32 s24, s24, s32
ODD_WAVES:
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
// scale both ids by 0x10 (16, the byte size of one dwordx4 access);
// presumably they are then used as byte offsets via offen — confirm
v_mul_i32_i24 v3, v3, 0x10
v_mul_i32_i24 v9, v9, 0x10
// Phase 1: 40 unrolled stores at offset v9; s31 starts at 0x6000 and grows
// by 0x6000 after each store.
s_mov_b32 s31, 0x6000
// stores 1-10
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
// stores 11-20
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
// stores 21-30
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
// stores 31-40
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
s_barrier
// starting scalar offsets: s31 for LOAD_LOOP, s32 for ATOMIC_LOOP
s_mov_b32 s31, 0xF0000
s_mov_b32 s32, 0x6000
//store and load s8 times
s_mov_b32 s30, s8
// waves with s9 < 2 go straight to ATOMIC_LOOP; the rest fall into LOAD_LOOP
s_cmpk_lt_i32 s9, 0x2
s_cbranch_scc1 ATOMIC_LOOP
// Phase 2: 40 unrolled loads at offset v3 per pass; destinations cycle
// through v4, v8, ..., v40 four times.
// NOTE(review): s31 is not reset between passes, so the offsets keep
// growing across iterations, and the results are never read in this
// shader — confirm both are intended.
LOAD_LOOP:
buffer_load_dwordx4 v4, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v8, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v12, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v16, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v20, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v24, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v28, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v32, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v36, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v40, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v4, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v8, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v12, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v16, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v20, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v24, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v28, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v32, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v36, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v40, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v4, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v8, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v12, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v16, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v20, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v24, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v28, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v32, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v36, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v40, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v4, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v8, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v12, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v16, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v20, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v24, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v28, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v32, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v36, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
buffer_load_dwordx4 v40, v3, s24, s31 offen:1
s_add_i32 s31, s31, 0x4000
// repeat until the s30 pass counter reaches zero
s_sub_u32 s30, s30, 1
s_cmpk_eq_u32 s30, 0
s_cbranch_scc0 LOAD_LOOP
// waves that ran LOAD_LOOP have s9 >= 2, so they skip ATOMIC_LOOP
s_cmpk_ge_i32 s9, 0x2
s_cbranch_scc1 END
// Phase 3: 40 unrolled atomics at offset v3 per pass; s32 grows by 0x2000
// after each one.
// NOTE(review): s32 is not reset between passes, so the offsets keep
// growing across iterations — confirm this is intended.
ATOMIC_LOOP:
// atomics 1-10
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
// atomics 11-20
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
// atomics 21-30
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
// atomics 31-40
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
// repeat until the s30 pass counter reaches zero
s_sub_u32 s30, s30, 1
s_cmpk_eq_u32 s30, 0
s_cbranch_scc0 ATOMIC_LOOP
//s_waitcnt 0
END:
// wait for all outstanding memory operations before ending the program
s_waitcnt 0
s_endpgm
end
@@ -0,0 +1,509 @@
// Compute shader in three phases:
//   1. every thread issues 40 buffer_store_dwordx4 with the scalar offset
//      stepping by 0x6000 from 0x9000, then waits at s_barrier;
//   2. waves with s9 >= 2 run LOAD_LOOP, which issues 148 s_atomic_add with
//      data s20 = 1 at fixed immediate offsets from s0, then jump to END;
//   3. waves with s9 < 2 run ATOMIC_LOOP (80 buffer_atomic_add_x2 per pass,
//      scalar offset stepping by 0x2000 from 0x9000).
// Both loops run s8 passes (counter copied to s30). Bit 12 of s24 is cleared
// when s9 is odd and set when s9 is even.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource + 5 for thread/thread group parameters + output offset + loop count
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
//set bit 12 low to select EA0
s_mov_b32 s32, 0xFFFFEFFF
s_and_b32 s24, s24, s32
// test the low bit of s9 (s_tgid_x per the comment above): when it is 1,
// skip the next two instructions so bit 12 stays cleared
s_and_b32 s31, s9, 0x1
s_cmpk_eq_i32 s31, 0x1
s_cbranch_scc1 ODD_WAVES
//set bit 12 high to select EA1
s_mov_b32 s32, 0x1000
s_or_b32 s24, s24, s32
ODD_WAVES:
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
// scale both ids by 0x10 (16, the byte size of one dwordx4 access);
// presumably they are then used as byte offsets via offen — confirm
v_mul_i32_i24 v3, v3, 0x10
v_mul_i32_i24 v9, v9, 0x10
// Phase 1: 40 unrolled stores at offset v9; s31 starts at 0x9000 and grows
// by 0x6000 after each store.
s_mov_b32 s31, 0x9000
// stores 1-10
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
// stores 11-20
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
// stores 21-30
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
// stores 31-40
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
s_add_i32 s31, s31, 0x6000
s_barrier
// NOTE(review): s31 is set here but not read by any later instruction in
// this shader; s32 is the starting scalar offset for ATOMIC_LOOP.
s_mov_b32 s31, 0xF0000
s_mov_b32 s32, 0x9000
//store and load s8 times
s_mov_b32 s30, s8
// waves with s9 < 2 go straight to ATOMIC_LOOP; the rest fall into LOAD_LOOP
s_cmpk_lt_i32 s9, 0x2
s_cbranch_scc1 ATOMIC_LOOP
// data operand for the scalar atomics below
s_mov_b32 s20, 0x1
// Phase 2: despite the label name, this loop contains scalar atomic adds,
// not loads. Each pass issues 148 s_atomic_add: four runs of ten offsets
// stepping 0x10 (0x100000, 0x100100, 0x100200, 0x100300), then nine runs
// of twelve offsets stepping 4 (0x100400 .. 0x100c00).
// NOTE(review): the 0x10-step runs stop at ..90 and the 4-step runs at
// ..2c, leaving gaps before the next run — confirm these offsets are
// intended. The base operand is written as s0; presumably sp3 treats it
// as the 64-bit address in s[0:1] — confirm.
LOAD_LOOP:
s_atomic_add s20, s0, 0x100000
s_atomic_add s20, s0, 0x100010
s_atomic_add s20, s0, 0x100020
s_atomic_add s20, s0, 0x100030
s_atomic_add s20, s0, 0x100040
s_atomic_add s20, s0, 0x100050
s_atomic_add s20, s0, 0x100060
s_atomic_add s20, s0, 0x100070
s_atomic_add s20, s0, 0x100080
s_atomic_add s20, s0, 0x100090
s_atomic_add s20, s0, 0x100100
s_atomic_add s20, s0, 0x100110
s_atomic_add s20, s0, 0x100120
s_atomic_add s20, s0, 0x100130
s_atomic_add s20, s0, 0x100140
s_atomic_add s20, s0, 0x100150
s_atomic_add s20, s0, 0x100160
s_atomic_add s20, s0, 0x100170
s_atomic_add s20, s0, 0x100180
s_atomic_add s20, s0, 0x100190
s_atomic_add s20, s0, 0x100200
s_atomic_add s20, s0, 0x100210
s_atomic_add s20, s0, 0x100220
s_atomic_add s20, s0, 0x100230
s_atomic_add s20, s0, 0x100240
s_atomic_add s20, s0, 0x100250
s_atomic_add s20, s0, 0x100260
s_atomic_add s20, s0, 0x100270
s_atomic_add s20, s0, 0x100280
s_atomic_add s20, s0, 0x100290
s_atomic_add s20, s0, 0x100300
s_atomic_add s20, s0, 0x100310
s_atomic_add s20, s0, 0x100320
s_atomic_add s20, s0, 0x100330
s_atomic_add s20, s0, 0x100340
s_atomic_add s20, s0, 0x100350
s_atomic_add s20, s0, 0x100360
s_atomic_add s20, s0, 0x100370
s_atomic_add s20, s0, 0x100380
s_atomic_add s20, s0, 0x100390
s_atomic_add s20, s0, 0x100400
s_atomic_add s20, s0, 0x100404
s_atomic_add s20, s0, 0x100408
s_atomic_add s20, s0, 0x10040c
s_atomic_add s20, s0, 0x100410
s_atomic_add s20, s0, 0x100414
s_atomic_add s20, s0, 0x100418
s_atomic_add s20, s0, 0x10041c
s_atomic_add s20, s0, 0x100420
s_atomic_add s20, s0, 0x100424
s_atomic_add s20, s0, 0x100428
s_atomic_add s20, s0, 0x10042c
s_atomic_add s20, s0, 0x100500
s_atomic_add s20, s0, 0x100504
s_atomic_add s20, s0, 0x100508
s_atomic_add s20, s0, 0x10050c
s_atomic_add s20, s0, 0x100510
s_atomic_add s20, s0, 0x100514
s_atomic_add s20, s0, 0x100518
s_atomic_add s20, s0, 0x10051c
s_atomic_add s20, s0, 0x100520
s_atomic_add s20, s0, 0x100524
s_atomic_add s20, s0, 0x100528
s_atomic_add s20, s0, 0x10052c
s_atomic_add s20, s0, 0x100600
s_atomic_add s20, s0, 0x100604
s_atomic_add s20, s0, 0x100608
s_atomic_add s20, s0, 0x10060c
s_atomic_add s20, s0, 0x100610
s_atomic_add s20, s0, 0x100614
s_atomic_add s20, s0, 0x100618
s_atomic_add s20, s0, 0x10061c
s_atomic_add s20, s0, 0x100620
s_atomic_add s20, s0, 0x100624
s_atomic_add s20, s0, 0x100628
s_atomic_add s20, s0, 0x10062c
s_atomic_add s20, s0, 0x100700
s_atomic_add s20, s0, 0x100704
s_atomic_add s20, s0, 0x100708
s_atomic_add s20, s0, 0x10070c
s_atomic_add s20, s0, 0x100710
s_atomic_add s20, s0, 0x100714
s_atomic_add s20, s0, 0x100718
s_atomic_add s20, s0, 0x10071c
s_atomic_add s20, s0, 0x100720
s_atomic_add s20, s0, 0x100724
s_atomic_add s20, s0, 0x100728
s_atomic_add s20, s0, 0x10072c
s_atomic_add s20, s0, 0x100800
s_atomic_add s20, s0, 0x100804
s_atomic_add s20, s0, 0x100808
s_atomic_add s20, s0, 0x10080c
s_atomic_add s20, s0, 0x100810
s_atomic_add s20, s0, 0x100814
s_atomic_add s20, s0, 0x100818
s_atomic_add s20, s0, 0x10081c
s_atomic_add s20, s0, 0x100820
s_atomic_add s20, s0, 0x100824
s_atomic_add s20, s0, 0x100828
s_atomic_add s20, s0, 0x10082c
s_atomic_add s20, s0, 0x100900
s_atomic_add s20, s0, 0x100904
s_atomic_add s20, s0, 0x100908
s_atomic_add s20, s0, 0x10090c
s_atomic_add s20, s0, 0x100910
s_atomic_add s20, s0, 0x100914
s_atomic_add s20, s0, 0x100918
s_atomic_add s20, s0, 0x10091c
s_atomic_add s20, s0, 0x100920
s_atomic_add s20, s0, 0x100924
s_atomic_add s20, s0, 0x100928
s_atomic_add s20, s0, 0x10092c
s_atomic_add s20, s0, 0x100a00
s_atomic_add s20, s0, 0x100a04
s_atomic_add s20, s0, 0x100a08
s_atomic_add s20, s0, 0x100a0c
s_atomic_add s20, s0, 0x100a10
s_atomic_add s20, s0, 0x100a14
s_atomic_add s20, s0, 0x100a18
s_atomic_add s20, s0, 0x100a1c
s_atomic_add s20, s0, 0x100a20
s_atomic_add s20, s0, 0x100a24
s_atomic_add s20, s0, 0x100a28
s_atomic_add s20, s0, 0x100a2c
s_atomic_add s20, s0, 0x100b00
s_atomic_add s20, s0, 0x100b04
s_atomic_add s20, s0, 0x100b08
s_atomic_add s20, s0, 0x100b0c
s_atomic_add s20, s0, 0x100b10
s_atomic_add s20, s0, 0x100b14
s_atomic_add s20, s0, 0x100b18
s_atomic_add s20, s0, 0x100b1c
s_atomic_add s20, s0, 0x100b20
s_atomic_add s20, s0, 0x100b24
s_atomic_add s20, s0, 0x100b28
s_atomic_add s20, s0, 0x100b2c
s_atomic_add s20, s0, 0x100c00
s_atomic_add s20, s0, 0x100c04
s_atomic_add s20, s0, 0x100c08
s_atomic_add s20, s0, 0x100c0c
s_atomic_add s20, s0, 0x100c10
s_atomic_add s20, s0, 0x100c14
s_atomic_add s20, s0, 0x100c18
s_atomic_add s20, s0, 0x100c1c
s_atomic_add s20, s0, 0x100c20
s_atomic_add s20, s0, 0x100c24
s_atomic_add s20, s0, 0x100c28
s_atomic_add s20, s0, 0x100c2c
// repeat until the s30 pass counter reaches zero
s_sub_u32 s30, s30, 1
s_cmpk_eq_u32 s30, 0
s_cbranch_scc0 LOAD_LOOP
// waves that ran LOAD_LOOP have s9 >= 2, so they skip ATOMIC_LOOP
s_cmpk_ge_i32 s9, 0x2
s_cbranch_scc1 END
// Phase 3: 80 unrolled atomics at offset v3 per pass; s32 grows by 0x2000
// after each one.
// NOTE(review): s32 is not reset between passes, so the offsets keep
// growing across iterations — confirm this is intended.
ATOMIC_LOOP:
// atomics 1-10
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
// atomics 11-20
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
// atomics 21-30
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
// atomics 31-40
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
// atomics 41-50
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
// atomics 51-60
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
// atomics 61-70
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
// atomics 71-80
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
s_add_i32 s32, s32, 0x2000
// repeat until the s30 pass counter reaches zero
s_sub_u32 s30, s30, 1
s_cmpk_eq_u32 s30, 0
s_cbranch_scc0 ATOMIC_LOOP
//s_waitcnt 0
END:
// wait for all outstanding memory operations before ending the program
s_waitcnt 0
s_endpgm
end
File diff soppresso perché troppo grande Carica Diff
@@ -0,0 +1,80 @@
// Store-loop shader: every thread stores one dword per iteration for 33 iterations,
// advancing the scalar offset by 4 KB between stores. A load-back loop follows, but it is
// wrapped in `if DEBUG_FUNCTION` and DEBUG_FUNCTION is set to 0 in this file.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 parameters (s2..s8)
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset (not read by this shader)
//s8 loop (overwritten with the constant 33 below)
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
// two 4-dword descriptors are read from the table at s[0:1]: offset 0 -> s[24:27], offset 16 -> s[20:23]
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
var MTYPE_UC = 0x38000000
// OR the MTYPE_UC bits into the last dword of the descriptor held in s[24:27]
// NOTE(review): the buffer accesses below pass s20, not s24 - confirm which descriptor
// this override is meant to apply to
s_or_b32 s27, s27, MTYPE_UC
//store and load s8 times
s_mov_b32 s8, 33 // store 33 times to overflow atcl1 cache...
s_mov_b32 s30, s8 // keep a copy of the iteration count for the (debug-only) load loop
s_mov_b32 s31, 0x0 // scalar byte offset, starts at 0
STORE_LOOP:
// v0 is bumped by 2 before every store, so the stored value changes each iteration
v_add_co_u32 v0, vcc[0:1], v0, 2
buffer_store_dword v0, v9, s20, s31 idxen:1 glc:1 slc:1
s_waitcnt 0
s_add_u32 s31, s31, 4*1024 // step one 4KB page size
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP // loop again while s8 != 0
var DEBUG_FUNCTION = 0
//remove code to half shader run time
// with DEBUG_FUNCTION = 0 the load-back loop below is presumably not assembled - TODO confirm sp3 `if` semantics
if DEBUG_FUNCTION
s_mov_b32 s8, s30 // restore the iteration count saved before the store loop
s_mov_b32 s31, 0x0
LOAD_LOOP:
buffer_load_dword v0, v9, s20, s31 idxen:1 glc:1 slc:1
s_waitcnt 0
v_mov_b32 v12, v0 // consume the loaded value
s_add_u32 s31, s31, 4*1024
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP // loop again while s8 != 0
end
s_endpgm
end
@@ -0,0 +1,80 @@
// Store/load shader: every thread stores one dword per iteration for 33 iterations
// (4 KB scalar-offset step), then - because DEBUG_FUNCTION is 1 here - performs 32
// two-dword loads starting at scalar offset 0xffc, again stepping 4 KB each time.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 parameters (s2..s8)
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset (not read by this shader)
//s8 loop (overwritten with constants below)
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
// two 4-dword descriptors are read from the table at s[0:1]: offset 0 -> s[24:27], offset 16 -> s[20:23]
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
var MTYPE_UC = 0x38000000
// OR the MTYPE_UC bits into the last dword of the descriptor held in s[24:27]
// NOTE(review): the buffer accesses below pass s20, not s24 - confirm which descriptor
// this override is meant to apply to
s_or_b32 s27, s27, MTYPE_UC
//store and load s8 times
s_mov_b32 s8, 33 // store 33 times to overflow atcl1 cache...
s_mov_b32 s30, s8 // copy of the count; not read again in this shader
s_mov_b32 s31, 0x0 // scalar byte offset, starts at 0
STORE_LOOP:
// v0 is bumped by 2 before every store, so the stored value changes each iteration
v_add_co_u32 v0, vcc[0:1], v0, 2
buffer_store_dword v0, v9, s20, s31 idxen:1 glc:1 slc:1
s_waitcnt 0
s_add_u32 s31, s31, 4*1024 // step one 4KB page size
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP // loop again while s8 != 0
var DEBUG_FUNCTION = 1
//remove code to half shader run time
if DEBUG_FUNCTION
s_mov_b32 s8, 0x20 // 32 load iterations
// start 4 bytes below 0x1000
// NOTE(review): combined with the two-dword load this looks intended to span the
// 4 KB boundary - confirm against the test description
s_mov_b32 s31, 0xffc
LOAD_LOOP:
buffer_load_dwordx2 v[0:1], v9, s20, s31 idxen:1 glc:1 slc:1
s_waitcnt 0
v_mov_b32 v12, v0 // consume the first loaded dword
s_add_u32 s31, s31, 4*1024
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP // loop again while s8 != 0
end
s_endpgm
end
@@ -0,0 +1,72 @@
// Store-then-load shader with a widened index: the absolute thread id is multiplied by
// 65536 before being used as the buffer index, and the descriptor dword in s26 is forced
// to 0xffffffff. Each thread then does s8 dword stores followed by s8 dword loads,
// stepping the scalar offset by 4 bytes per iteration.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 parameters (s2..s8)
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset (not read by this shader)
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//bump up the addresses being accessed to generate multiple reads to the pde memories
// NOTE(review): v_mul_u32_u24 presumably uses only the low 24 bits of each operand - confirm
// the thread-id range stays within that
v_mul_u32_u24 v9, 65536, v9
//store and load s8 times
s_mov_b32 s30, s8 // save the iteration count so the load loop can reuse it
s_mov_b32 s31, 0x0 // scalar byte offset, starts at 0
//Hack number of records to avoid range checking which we don't want since we want to generate
//out of range accesses. we are really trying to generate many reads to the PDEs to get FUE.
s_mov_b32 s26, 0xffffffff
STORE_LOOP:
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
s_add_u32 s31, s31, 0x4
// the counter is decremented before it is compared with 0
// NOTE(review): an initial s8 of 0 would therefore not exit on the first pass - confirm
// the host always passes s8 >= 1
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP // loop again while s8 != 0
s_mov_b32 s8, s30 // restore the iteration count
s_mov_b32 s31, 0x0 // restart from scalar offset 0
LOAD_LOOP:
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
v_mov_b32 v12, v0 // consume the loaded value
s_add_u32 s31, s31, 0x4
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP // loop again while s8 != 0
s_endpgm
end
@@ -0,0 +1,72 @@
// Store-then-load shader with a widened index: the absolute thread id is multiplied by
// 4096 before being used as the buffer index, and the descriptor dword in s26 is forced
// to 0xffffffff. Each thread then does s8 dword stores followed by s8 dword loads,
// stepping the scalar offset by 4 bytes per iteration.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 parameters (s2..s8)
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset (not read by this shader)
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//bump up the addresses being accessed to generate multiple reads to the pde memories
// NOTE(review): v_mul_u32_u24 presumably uses only the low 24 bits of each operand - confirm
// the thread-id range stays within that
v_mul_u32_u24 v9, 4096, v9
//store and load s8 times
s_mov_b32 s30, s8 // save the iteration count so the load loop can reuse it
s_mov_b32 s31, 0x0 // scalar byte offset, starts at 0
//Hack number of records to avoid range checking which we don't want since we want to generate
//out of range accesses. we are really trying to generate many reads to the PDEs to get FUE.
s_mov_b32 s26, 0xffffffff
STORE_LOOP:
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
s_add_u32 s31, s31, 0x4
// the counter is decremented before it is compared with 0
// NOTE(review): an initial s8 of 0 would therefore not exit on the first pass - confirm
// the host always passes s8 >= 1
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP // loop again while s8 != 0
s_mov_b32 s8, s30 // restore the iteration count
s_mov_b32 s31, 0x0 // restart from scalar offset 0
LOAD_LOOP:
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
v_mov_b32 v12, v0 // consume the loaded value
s_add_u32 s31, s31, 0x4
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP // loop again while s8 != 0
s_endpgm
end
@@ -0,0 +1,47 @@
// Single-read shader: every thread computes its absolute thread id and issues one
// dword buffer load indexed by that id, then the program ends.
shader main
type(CS)
user_sgpr_count(8) // 2 for the buffer resource address (s[0:1]) + 6 parameters (s2..s7)
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset (not read by this shader)
tgid_x_en(1) //s_tgid_x s8
tgid_y_en(1) //s_tgid_y s9
tgid_z_en(1) //s_tgid_z s10
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//read mem data
s_mov_b32 s31, 0x0 // scalar byte offset 0
// one dword per thread, indexed by the absolute thread id; the result lands in v0 and is not used further
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
s_waitcnt 0
s_endpgm
end
@@ -0,0 +1,54 @@
// Store-only shader: every thread stores v0 to the buffer s8 times, indexed by its
// absolute thread id, stepping the scalar offset by 4 bytes per iteration.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 parameters (s2..s8)
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset (not read by this shader)
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
//fetch the buffer resource through SQC
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3
//store s8 times (this shader has no load loop)
s_mov_b32 s30, s8 // copy of the count; not read again in this shader
s_mov_b32 s31, 0x0 // scalar byte offset, starts at 0
STORE_LOOP:
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
s_waitcnt 0
s_add_u32 s31, s31, 0x4
// the counter is decremented before it is compared with 0
// NOTE(review): an initial s8 of 0 would therefore not exit on the first pass - confirm
// the host always passes s8 >= 1
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP // loop again while s8 != 0
s_endpgm
end
@@ -0,0 +1,54 @@
// VGPR copy shader: copies the incoming thread-id VGPRs into v10..v19, copies those
// into v20..v29, then writes the user SGPR s2 to the start of the buffer at s[0:1]
// with a scalar store. No buffer descriptor is loaded and the thread-group ids are unused.
shader main
type(CS)
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 parameters (s2..s8)
//s[0:1] the memory address for the buffer resource
//s2 x
//s3 x*y
//s4 x*y*z
//s5 X
//s6 X*Y
//s7 output offset
//s8 loop
tgid_x_en(1) //s_tgid_x s9
tgid_y_en(1) //s_tgid_y s10
tgid_z_en(1) //s_tgid_z s11
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
s_mov_b32 s16, s2 // s16 = x; written out by the scalar store at the end
//SPI may touch v0,v1,v2 before shader is run
//ten VGPR writes: v1/v2 alternately into v10..v18, then v0 into v19
v_mov_b32 v10, v1
v_mov_b32 v11, v2
v_mov_b32 v12, v1
v_mov_b32 v13, v2
v_mov_b32 v14, v1
v_mov_b32 v15, v2
v_mov_b32 v16, v1
v_mov_b32 v17, v2
v_mov_b32 v18, v1
v_mov_b32 v19, v0
// read them back: v10..v19 are copied into v29..v20 (destination order reversed)
v_mov_b32 v29, v10
v_mov_b32 v28, v11
v_mov_b32 v27, v12
v_mov_b32 v26, v13
v_mov_b32 v25, v14
v_mov_b32 v24, v15
v_mov_b32 v23, v16
v_mov_b32 v22, v17
v_mov_b32 v21, v18
v_mov_b32 v20, v19
// write s16 (= s2) to byte offset 0 of the address in s[0:1]
// NOTE(review): no s_waitcnt follows this scalar store before s_endpgm - confirm that is acceptable
s_store_dword s16, s[0:1], 0x0 glc
s_endpgm
end
@@ -0,0 +1,75 @@
// Register/LDS initialisation shader: touches every acc VGPR, writes zeros to VGPRs
// through GPR indexing, writes v[2:5] to LDS in a loop, and finally stores a
// per-SIMD identifier derived from HW_REG_HW_ID into the buffer at s[0:1].
shader main
type(CS)
user_sgpr_count(2) // 2 user SGPRs: s[0:1], the buffer resource address
//s[0:1] the memory address for the buffer resource
tgid_x_en(1) //s_tgid_x s2
tgid_y_en(1) //s_tgid_y s3
tgid_z_en(1) //s_tgid_z s4
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z
// sp3 `for` loops (presumably expanded at assemble time into 256 instructions each):
// read every acc[n] into v[n], then write every v[n] back into acc[n]
for var vgpr = 0; vgpr < 256; ++vgpr
v_accvgpr_read v[vgpr], acc[vgpr]
end
for var vgpr = 0; vgpr < 256; ++vgpr
v_accvgpr_write acc[vgpr], v[vgpr]
end
s_movk_i32 m0, 0x0000
// GPR-index loop: s10 starts at 0xf8 and steps down by 8; each pass writes 0 to eight
// VGPR operands (v0..v7 as encoded) with indexing enabled.
// NOTE(review): presumably mode 0x8 selects destination-relative indexing so the writes
// land on v[s10+0 .. s10+7], and the loop ends when the subtract borrows - confirm against the ISA guide
s_mov_b32 s10, 0x000000f8
s_set_gpr_idx_on s10, 0x8
label_0004:
v_mov_b32 v0, 0
v_mov_b32 v1, 0
v_mov_b32 v2, 0
v_mov_b32 v3, 0
v_mov_b32 v4, 0
v_mov_b32 v5, 0
v_mov_b32 v6, 0
v_mov_b32 v7, 0
s_sub_u32 s10, s10, 8
s_set_gpr_idx_idx s10
s_cbranch_scc0 label_0004
s_set_gpr_idx_off
// v1 = per-lane count from the mbcnt pair, times 8 (LDS byte address, 8 bytes per lane)
// NOTE(review): exec_hi is passed to the _lo instruction and exec_lo to the _hi one, which
// looks swapped relative to the usual idiom - confirm this is intended
v_mbcnt_lo_u32_b32 v1, exec_hi, 0
v_mbcnt_hi_u32_b32 v1, exec_lo, v1
v_mul_u32_u24 v1, 8, v1
// s11 = 2-bit field at bit 4 of HW_ID (the same bits are labelled SIMD further down), times 0x4000
s_getreg_b32 s11, hwreg(HW_REG_HW_ID, 4, 2)
s_mulk_i32 s11, 0x4000
v_add_co_u32 v1, vcc, v1, s11
// LDS write loop: each pass issues two ds_write2_b64 (v[2:3] and v[4:5]) and advances the address by 0x800
// NOTE(review): with s10 starting at 7 and the branch taken while SCC is 0, this presumably
// runs 8 passes (exit on borrow) - confirm
s_mov_b32 s10, 7
s_mov_b32 m0, -1
label_001B:
ds_write2_b64 v1, v[2:3], v[2:3] offset1:64
ds_write2_b64 v1, v[4:5], v[4:5] offset0:128 offset1:192
v_add_co_u32 v1, vcc, 0x00000800, v1
s_sub_u32 s10, s10, 1
s_cbranch_scc0 label_001B
// read the whole HW_ID register and extract the SIMD / CU / SE fields
s_getreg_b32 s20, hwreg(HW_REG_HW_ID, 0, 32)
// s12 = SIMD
s_lshr_b32 s12,s20,4
s_and_b32 s12, s12, 0x3
// s13 = CU
s_lshr_b32 s13,s20,8
s_and_b32 s13, s13, 0xf
// s14 = SE
s_lshr_b32 s14,s20,13
s_and_b32 s14, s14, 0x7
// s15 = SE * 16 * 4 + CU * 4 + SIMD
s_mul_i32 s16, s14, 64
s_mul_i32 s17, s13, 4
s_add_i32 s15, s16, s17
s_add_i32 s15, s15, s12
// store s15 into the buffer at byte offset s15 * 4 (one dword slot per SIMD identifier)
s_mul_i32 s16, s15, 4
s_store_dword s15, s[0:1], s16 glc
s_waitcnt 0
s_endpgm
end
@@ -0,0 +1,58 @@
// Lifetime shader: runs a counting loop of s7 iterations, then each thread stores its
// absolute thread id into the buffer (indexed by that id, scalar offset s8) and the
// constant 0xa5a50000 is written at byte offset 0x40 from the address in s[0:1].
//
// Register inputs:
//s[0:1]: buffer resource
//s2: num_threads_x_full
//s3: num_threads_x_full * num_threads_y_full
//s4: num_threads_x_full * num_threads_y_full * num_threads_z_full
//s5: COMPUTE_DIM_X
//s6: COMPUTE_DIM_X * COMPUTE_DIM_Y
//s7: loop_lifetime
//s8: dispatch_offset
//s[9:11]: thread group ID
//v[0:2]: thread ID
shader main
type(CS)
user_sgpr_count(9)
tgid_x_en(1)
tgid_y_en(1)
tgid_z_en(1)
//sp3 loop for lifetime
// counts s12 from 0 up to s7 (signed compare); the only work per pass is copying s12 into v4
// NOTE(review): presumably a delay loop to extend how long the wave stays resident - confirm
s_mov_b32 s12, 0 //init loop idx s12
label_0004:
s_cmp_lt_i32 s12, s7 //scc = (s12 < s7) ? 1 : 0
s_cbranch_scc0 label_0006 //if(scc == 0) then jump to label_0006; else nop
v_mov_b32 v4,s12
s_add_i32 s12, s12, 1 //add loop incr
s_branch label_0004
label_0006: //end of SP3 loop
//v3 thread_id_in_group = (tid_z * num_threads_x_full * num_threads_y_full) + (tid_y * num_threads_x_full) + tid_x
v_mad_u32_u24 v3, v1, s2, v0 //v3 = tid_y * num_threads_x_full + tid_x
v_mad_u32_u24 v3, v2, s3, v3 //v3 = tid_z * num_threads_x_full * num_threads_y_full + v3
//s28 thread_group_id = (tgid_z * COMPUTE_DIM_X * COMPUTE_DIM_Y) + (tgid_y * COMPUTE_DIM_X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5 //tgid_y * COMPUTE_DIM_X
s_add_i32 s28, s28, s_tgid_x //tgid_y * COMPUTE_DIM_X + tgid_x
s_mul_i32 s29, s6, s_tgid_z //tgid_z * COMPUTE_DIM_X * COMPUTE_DIM_Y
s_add_i32 s28, s29, s28 //s28 = thread_group_id
//v9 absolute thread id = thread_group_id * (num_threads_x_full * num_threads_y_full * num_threads_z_full) + thread_id_in_group
v_mov_b32 v9, s28 //thread_group_id
v_mad_u32_u24 v9, v9, s4, v3
//fetch the buffer resource
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0
//write absolute thread id using it as an index
buffer_store_dword v9, v9, s24, s8 idxen:1 //scalar offset = s8 (dispatch_offset)
s_waitcnt 0
// write the marker 0xa5a50000 at byte offset 0x40 from the address in s[0:1]
// NOTE(review): no s_waitcnt follows this scalar store before s_endpgm - confirm that is acceptable
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc
s_endpgm
end

Alcuni file non sono stati mostrati perché troppi file sono cambiati in questo diff Mostra Altro