Create prebuild raslib package for RDC
Create a folder for the prebuilt raslib, which contains the RAS binary and configuration files. The CMakeLists.txt is changed to include those files. Change-Id: I530198cff5686a19e58096c87457ab8b7c52d5f3
Этот коммит содержится в:
@@ -236,6 +236,17 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/example
|
||||
DESTINATION ${RDC_CLIENT_INSTALL_PREFIX}/${RDC}
|
||||
COMPONENT ${CLIENT_COMPONENT})
|
||||
|
||||
# Prebuilt packages to install
|
||||
install(FILES ${PROJECT_SOURCE_DIR}/ras_prebuild/librdc_ras.so
|
||||
DESTINATION ${RDC_CLIENT_INSTALL_PREFIX}/${RDC}/lib
|
||||
COMPONENT ${CLIENT_COMPONENT})
|
||||
install(DIRECTORY ${PROJECT_SOURCE_DIR}/ras_prebuild/config
|
||||
DESTINATION ${RDC_CLIENT_INSTALL_PREFIX}/${RDC}/lib
|
||||
COMPONENT ${CLIENT_COMPONENT})
|
||||
install(DIRECTORY ${PROJECT_SOURCE_DIR}/ras_prebuild/sp3
|
||||
DESTINATION ${RDC_CLIENT_INSTALL_PREFIX}/${RDC}/lib
|
||||
COMPONENT ${CLIENT_COMPONENT})
|
||||
|
||||
|
||||
set(CPACK_PACKAGE_NAME ${RDC_PACKAGE})
|
||||
set(CPACK_PACKAGE_VERSION ${VERSION_STRING})
|
||||
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"version": "0.0.1",
|
||||
"devices": [
|
||||
{
|
||||
"name": "VEGA20",
|
||||
"ids": [ "0x66A0", "0x66A1", "0x66A2", "0x66A3", "0x66A4", "0x66A7", "0x66AF" ],
|
||||
"config": "vega20.json",
|
||||
"gfx": "libgfx9.so",
|
||||
"sdma": "libsdma4.so"
|
||||
},
|
||||
{
|
||||
"name": "ARCTURUS",
|
||||
"ids": [ "0x738C", "0x7388", "0x738E" ],
|
||||
"config": "arcturus.json",
|
||||
"gfx": "libgfx9.so",
|
||||
"sdma": "libsdma4.so"
|
||||
},
|
||||
{
|
||||
"name": "SIENNA_CICHLID",
|
||||
"ids": [ "0x73A0", "0x73A2", "0x73A3", "0x73AB", "0x73AE", "0x73BF" ],
|
||||
"config": "sienna_cichlid.json",
|
||||
"gfx": "libgfx10.so",
|
||||
"sdma": "libsdma5.so"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,34 @@
|
||||
{
|
||||
"version": "0.0.1",
|
||||
"type": {
|
||||
"parity": 1,
|
||||
"single_correctable": 2,
|
||||
"multi_uncorrectable": 4,
|
||||
"poison": 8
|
||||
},
|
||||
"block": {
|
||||
"umc": {
|
||||
"index": 0,
|
||||
"support": 1,
|
||||
"type": [
|
||||
"single_correctable",
|
||||
"multi_uncorrectable",
|
||||
"poison"
|
||||
]
|
||||
}
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"name": "ras_umc.0.2",
|
||||
"block": "umc",
|
||||
"type": "single_correctable",
|
||||
"nullDispatchCS": "sp3/gfx10/edc/bin/sienna_cichlid/gc_edc_sqc_inst_bank_snop.bin"
|
||||
},
|
||||
{
|
||||
"name": "ras_umc.0.4",
|
||||
"block": "umc",
|
||||
"type": "multi_uncorrectable",
|
||||
"nullDispatchCS": "sp3/gfx10/edc/bin/sienna_cichlid/gc_edc_sqc_inst_bank_snop.bin"
|
||||
}
|
||||
]
|
||||
}
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Двоичные данные
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
@@ -0,0 +1,31 @@
|
||||
shader main
|
||||
asic(GFX10)
|
||||
wave_size(32)
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource + 5 for thread/thread group parameters + 1 for the output offset + 1 for the loop count
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
for var i = 0; i < 1000; i++
|
||||
s_nop 0x1
|
||||
end
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,42 @@
|
||||
shader main
|
||||
type(CS)
|
||||
user_sgpr_count(0)
|
||||
|
||||
// Clear ACC VGPR
|
||||
for var vgpr = 0; vgpr < 256; ++vgpr
|
||||
v_accvgpr_write acc[vgpr], 0
|
||||
end
|
||||
|
||||
s_movk_i32 m0, 0x0000
|
||||
s_mov_b32 s10, 0x000000f8
|
||||
s_set_gpr_idx_on s10, 0x8
|
||||
label_0004:
|
||||
v_mov_b32 v0, 0
|
||||
v_mov_b32 v1, 0
|
||||
v_mov_b32 v2, 0
|
||||
v_mov_b32 v3, 0
|
||||
v_mov_b32 v4, 0
|
||||
v_mov_b32 v5, 0
|
||||
v_mov_b32 v6, 0
|
||||
v_mov_b32 v7, 0
|
||||
s_sub_u32 s10, s10, 8
|
||||
s_set_gpr_idx_idx s10
|
||||
s_cbranch_scc0 label_0004
|
||||
s_set_gpr_idx_off
|
||||
v_mbcnt_lo_u32_b32 v1, exec_hi, 0
|
||||
v_mbcnt_hi_u32_b32 v1, exec_lo, v1
|
||||
v_mul_u32_u24 v1, 8, v1
|
||||
s_getreg_b32 s11, hwreg(HW_REG_HW_ID, 4, 2)
|
||||
s_mulk_i32 s11, 0x4000
|
||||
v_add_co_u32 v1, vcc, v1, s11
|
||||
s_mov_b32 s10, 7
|
||||
s_mov_b32 m0, -1
|
||||
label_001B:
|
||||
ds_write2_b64 v1, v[2:3], v[2:3] offset1:64
|
||||
ds_write2_b64 v1, v[4:5], v[4:5] offset0:128 offset1:192
|
||||
v_add_co_u32 v1, vcc, 0x00000800, v1
|
||||
s_sub_u32 s10, s10, 1
|
||||
s_cbranch_scc0 label_001B
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
@@ -0,0 +1,113 @@
|
||||
shader main
|
||||
type(CS)
|
||||
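user_sgpr_count(8) // 2 for the buffer resource + 5 for thread/thread group parameters + 1 for the output offset; NOTE(review): the register map below also lists s8 (loop) and s_tgid_x at s9, which would correspond to a count of 9 - confirm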
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
// Clear ACC VGPR
|
||||
for var vgpr = 0; vgpr < 256; ++vgpr
|
||||
v_accvgpr_write acc[vgpr], 0
|
||||
end
|
||||
|
||||
//sp3 loop for lifetime
|
||||
s_mov_b32 s12, 0 //init loop idx s12
|
||||
label_0001:
|
||||
s_cmp_lt_i32 s12, s8 //scc = (s12 < s8) ? 1 : 0
|
||||
s_cbranch_scc0 label_0006 //if(scc == 0) then jump to label_0006; else nop
|
||||
v_mov_b32 v4,s12
|
||||
s_add_i32 s12, s12, 1 //add loop incr
|
||||
s_branch label_0001
|
||||
label_0006: //end of SP3 loop
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
s_load_dwordx4 s[40:43], s[0:1], 0x20
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
// Clear VGPR and LDS
|
||||
s_movk_i32 m0, 0x0000
|
||||
s_mov_b32 s12, 0x000000f8
|
||||
s_set_gpr_idx_on s12, 0x8
|
||||
label_0004:
|
||||
v_mov_b32 v0, 0
|
||||
v_mov_b32 v1, 0
|
||||
v_mov_b32 v2, 0
|
||||
v_mov_b32 v3, 0
|
||||
v_mov_b32 v4, 0
|
||||
v_mov_b32 v5, 0
|
||||
v_mov_b32 v6, 0
|
||||
v_mov_b32 v7, 0
|
||||
s_sub_u32 s12, s12, 8
|
||||
s_set_gpr_idx_idx s12
|
||||
s_cbranch_scc0 label_0004
|
||||
s_set_gpr_idx_off
|
||||
v_mbcnt_lo_u32_b32 v1, exec_hi, 0
|
||||
v_mbcnt_hi_u32_b32 v1, exec_lo, v1
|
||||
v_mul_u32_u24 v1, 8, v1
|
||||
s_getreg_b32 s13, hwreg(HW_REG_HW_ID, 4, 2)
|
||||
s_mulk_i32 s13, 0x4000
|
||||
v_add_co_u32 v1, vcc, v1, s13
|
||||
s_mov_b32 s12, 7
|
||||
s_mov_b32 m0, -1
|
||||
label_001B:
|
||||
ds_write2_b64 v1, v[2:3], v[2:3] offset1:64
|
||||
ds_write2_b64 v1, v[4:5], v[4:5] offset0:128 offset1:192
|
||||
v_add_co_u32 v1, vcc, 0x00000800, v1
|
||||
s_sub_u32 s12, s12, 1
|
||||
s_cbranch_scc0 label_001B
|
||||
|
||||
// Save coverage in the memory
|
||||
s_getreg_b32 s20, hwreg(HW_REG_HW_ID, 0, 32)
|
||||
// s12 = SIMD
|
||||
s_lshr_b32 s12,s20,4
|
||||
s_and_b32 s12, s12, 0x3
|
||||
// s13 = CU
|
||||
s_lshr_b32 s13,s20,8
|
||||
s_and_b32 s13, s13, 0xf
|
||||
// s14 = SE
|
||||
s_lshr_b32 s14,s20,13
|
||||
s_and_b32 s14, s14, 0x7
|
||||
// s15 = SE * 16 * 4 + CU * 4 + SIMD
|
||||
s_mul_i32 s16, s14, 64
|
||||
s_mul_i32 s17, s13, 4
|
||||
s_add_i32 s15, s16, s17
|
||||
s_add_i32 s15, s15, s12
|
||||
s_mul_i32 s16, s15, 4
|
||||
|
||||
s_buffer_store_dword s15, s24, s16 glc
|
||||
s_waitcnt 0
|
||||
|
||||
s_buffer_load_dword s17, s24, s16 glc
|
||||
s_waitcnt 0
|
||||
s_endpgm
|
||||
end
|
||||
@@ -0,0 +1,59 @@
|
||||
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(8) // 2 for the buffer resource + 5 for thread/thread group parameters + 1 for the output offset
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s8
|
||||
tgid_y_en(1) //s_tgid_y s9
|
||||
tgid_z_en(1) //s_tgid_z s10
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//read mem data
|
||||
s_mov_b32 s31, 0x0
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
v_mov_b32 v10, v0
|
||||
//buffer_load_dword v10, v9, s24, s31 idxen:1 glc:1
|
||||
//s_waitcnt 0
|
||||
//v_mov_b32 v11, v1
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
|
||||
s_mov_b32 s16, 0xa5a50000
|
||||
s_store_dword s16, s[0:1], 0x40 glc
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(8) // 2 for the buffer resource + 5 for thread/thread group parameters + 1 for the output offset
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s8
|
||||
tgid_y_en(1) //s_tgid_y s9
|
||||
tgid_z_en(1) //s_tgid_z s10
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
|
||||
//read from the GDS
|
||||
v_lshlrev_b32 v10, 2, v3
|
||||
s_mov_b32 m0, 0xFFFF
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
|
||||
ds_read_b32 v11, v10 gds:1
|
||||
s_waitcnt 0
|
||||
|
||||
v_mov_b32 v12, v11
|
||||
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
s_nop 0x1
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,673 @@
|
||||
shader main
|
||||
|
||||
type(CS)
|
||||
|
||||
/*************************************************************************/
|
||||
/* control on how to run the shader */
|
||||
/*************************************************************************/
|
||||
//any hack that needs to be made to run this code in EMU (either because various EMU code is not ready or there is no compute save & restore in EMU run)
|
||||
var EMU_RUN_HACK = 1
|
||||
var EMU_RUN_HACK_RESTORE_NORMAL = 0
|
||||
var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
|
||||
var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
|
||||
var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
|
||||
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
|
||||
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
|
||||
var SAVE_LDS = 0
|
||||
var WG_BASE_ADDR_LO = 0x9000a000
|
||||
var WG_BASE_ADDR_HI = 0x0
|
||||
var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
|
||||
var CTX_SAVE_CONTROL = 0x0
|
||||
var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
|
||||
var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code is not ready or there is no compute save & restore in RTL run)
|
||||
var SGPR_SAVE_USE_SQC = 0 //use SQC D$ to do the write
|
||||
var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC)
|
||||
var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
|
||||
|
||||
/**************************************************************************/
|
||||
/* variables */
|
||||
/**************************************************************************/
|
||||
var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
|
||||
var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
|
||||
|
||||
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
|
||||
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
|
||||
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
|
||||
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
|
||||
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
|
||||
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
|
||||
|
||||
var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
|
||||
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
|
||||
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
|
||||
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
|
||||
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
|
||||
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
|
||||
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
|
||||
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
|
||||
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
|
||||
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
|
||||
|
||||
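var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME; NOTE(review): SQ_WAVE_IB_STS_RCNT_SIZE and SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE are referenced by the save routine but are not defined in this variable section - confirm they are provided elsewhere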
|
||||
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
|
||||
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
|
||||
|
||||
var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
|
||||
var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
|
||||
|
||||
|
||||
/* Save */
|
||||
var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
|
||||
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
|
||||
|
||||
var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
|
||||
var S_SAVE_SPI_INIT_ATC_SHIFT = 27
|
||||
var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
|
||||
var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
|
||||
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
|
||||
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
|
||||
|
||||
var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
|
||||
var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
|
||||
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
|
||||
var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
|
||||
|
||||
var s_save_spi_init_lo = exec_lo
|
||||
var s_save_spi_init_hi = exec_hi
|
||||
|
||||
//tba_lo and tba_hi need to be saved/restored
|
||||
var tba_lo = ttmp12
|
||||
var tba_hi = ttmp13
|
||||
var tma_lo = ttmp14
|
||||
var tma_hi = ttmp15
|
||||
|
||||
var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
|
||||
var s_save_pc_hi = ttmp1
|
||||
var s_save_exec_lo = ttmp2
|
||||
var s_save_exec_hi = ttmp3
|
||||
var s_save_status = ttmp4
|
||||
var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
|
||||
var s_save_xnack_mask_lo = ttmp6
|
||||
var s_save_xnack_mask_hi = ttmp7
|
||||
var s_save_buf_rsrc0 = ttmp8
|
||||
var s_save_buf_rsrc1 = ttmp9
|
||||
var s_save_buf_rsrc2 = ttmp10
|
||||
var s_save_buf_rsrc3 = ttmp11
|
||||
|
||||
var s_save_mem_offset = tma_lo
|
||||
var s_save_alloc_size = s_save_trapsts //conflict
|
||||
var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
|
||||
var s_save_m0 = tma_hi
|
||||
|
||||
/* Restore */
|
||||
var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
|
||||
var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
|
||||
|
||||
var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
|
||||
var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
|
||||
var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
|
||||
var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
|
||||
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
|
||||
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
|
||||
|
||||
var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
|
||||
var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
|
||||
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
|
||||
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
|
||||
|
||||
var s_restore_spi_init_lo = exec_lo
|
||||
var s_restore_spi_init_hi = exec_hi
|
||||
|
||||
var s_restore_mem_offset = ttmp2
|
||||
var s_restore_alloc_size = ttmp3
|
||||
var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored
|
||||
var s_restore_mem_offset_save = s_restore_tmp //no conflict
|
||||
|
||||
var s_restore_m0 = s_restore_alloc_size //no conflict
|
||||
|
||||
var s_restore_mode = ttmp7
|
||||
|
||||
var s_restore_pc_lo = ttmp0
|
||||
var s_restore_pc_hi = ttmp1
|
||||
var s_restore_exec_lo = tma_lo //no conflict
|
||||
var s_restore_exec_hi = tma_hi //no conflict
|
||||
var s_restore_status = ttmp4
|
||||
var s_restore_trapsts = ttmp5
|
||||
var s_restore_xnack_mask_lo = xnack_mask_lo
|
||||
var s_restore_xnack_mask_hi = xnack_mask_hi
|
||||
var s_restore_buf_rsrc0 = ttmp8
|
||||
var s_restore_buf_rsrc1 = ttmp9
|
||||
var s_restore_buf_rsrc2 = ttmp10
|
||||
var s_restore_buf_rsrc3 = ttmp11
|
||||
|
||||
/**************************************************************************/
|
||||
/* trap handler entry points */
|
||||
/**************************************************************************/
|
||||
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
|
||||
//FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
|
||||
s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
|
||||
s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
|
||||
s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
|
||||
//FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
|
||||
s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
|
||||
else
|
||||
s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
|
||||
end
|
||||
|
||||
L_JUMP_TO_RESTORE:
|
||||
s_branch L_RESTORE //restore
|
||||
|
||||
L_SKIP_RESTORE:
|
||||
|
||||
s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
|
||||
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
|
||||
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
|
||||
s_cbranch_scc1 L_SAVE //this is the operation for save
|
||||
//the potential code (such as restore STATUS) on this path is for regular trap handling and is not relevant to compute save & restore
|
||||
|
||||
//EMU will not execute the code since in hack mode it is skipped while in normal mode there is no save in EMU
|
||||
//SIM will only execute the code in normal S/R mode but not in hack mode
|
||||
if (!EMU_RUN_HACK)
|
||||
L_ERROR: //to catch incorrect savectx setting in SIM assuming the trap handler is only used for save & restore
|
||||
s_branch L_ERROR
|
||||
end
|
||||
|
||||
/**************************************************************************/
|
||||
/* save routine */
|
||||
/**************************************************************************/
|
||||
|
||||
L_SAVE:
|
||||
|
||||
//check whether there is mem_viol
|
||||
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
|
||||
s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
|
||||
s_cbranch_scc0 L_NO_PC_REWIND
|
||||
|
||||
//if so, need rewind PC assuming GDS operation gets NACKed
|
||||
s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
|
||||
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
|
||||
s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
|
||||
s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
|
||||
s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
|
||||
|
||||
L_NO_PC_REWIND:
|
||||
s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
|
||||
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
|
||||
|
||||
s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
|
||||
s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi
|
||||
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
|
||||
s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
|
||||
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
|
||||
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
|
||||
s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
|
||||
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
|
||||
s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
|
||||
s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
|
||||
|
||||
s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
|
||||
|
||||
/* inform SPI the readiness and wait for SPI's go signal */
|
||||
s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
|
||||
s_mov_b32 s_save_exec_hi, exec_hi
|
||||
s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
|
||||
if (EMU_RUN_HACK)
|
||||
|
||||
else
|
||||
s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
|
||||
end
|
||||
|
||||
L_SLEEP:
|
||||
s_sleep 0x2
|
||||
|
||||
if (EMU_RUN_HACK)
|
||||
|
||||
else
|
||||
s_cbranch_execz L_SLEEP
|
||||
end
|
||||
|
||||
|
||||
/* setup Resource Constants */
|
||||
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
|
||||
//calculate wd_addr using absolute thread id
|
||||
v_readlane_b32 s_save_tmp, v9, 0
|
||||
s_lshr_b32 s_save_tmp, s_save_tmp, 6
|
||||
s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
|
||||
s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
|
||||
s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
|
||||
s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
|
||||
else
|
||||
end
|
||||
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
|
||||
s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
|
||||
s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
|
||||
s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
|
||||
else
|
||||
end
|
||||
|
||||
|
||||
s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
|
||||
s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
|
||||
s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
|
||||
s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not necessarily inited
|
||||
s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
|
||||
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
|
||||
s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
|
||||
s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
|
||||
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
|
||||
s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
|
||||
s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
|
||||
|
||||
//FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
|
||||
s_mov_b32 s_save_m0, m0 //save M0
|
||||
|
||||
/* global mem offset */
|
||||
s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
|
||||
|
||||
|
||||
/* the first wave in the threadgroup */
|
||||
s_barrier //FIXME not performance-optimal "LDS is used? wait for other waves in the same TG"
|
||||
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
|
||||
s_cbranch_scc0 L_SAVE_VGPR
|
||||
|
||||
/* save LDS */
|
||||
//////////////////////////////
|
||||
L_SAVE_LDS:
|
||||
|
||||
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
|
||||
s_mov_b32 exec_hi, 0xFFFFFFFF
|
||||
|
||||
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
|
||||
s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
|
||||
s_cbranch_scc0 L_SAVE_VGPR //no lds used? jump to L_SAVE_VGPR
|
||||
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
|
||||
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
|
||||
s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
s_mov_b32 m0, 0x0 //lds_offset initial value = 0
|
||||
|
||||
L_SAVE_LDS_LOOP:
|
||||
if (SAVE_LDS)
|
||||
buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1
|
||||
end
|
||||
s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes
|
||||
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //mem offset increased by 256 bytes
|
||||
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
|
||||
s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
|
||||
|
||||
|
||||
/* save VGPRs */
|
||||
//////////////////////////////
|
||||
L_SAVE_VGPR:
|
||||
|
||||
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
|
||||
s_mov_b32 exec_hi, 0xFFFFFFFF
|
||||
|
||||
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size
|
||||
s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
|
||||
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
|
||||
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
|
||||
s_mov_b32 m0, 0x0 //VGPR initial index value =0
|
||||
s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
|
||||
s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later
|
||||
|
||||
L_SAVE_VGPR_LOOP:
|
||||
v_mov_b32 v0, v0 //v0 = v[0+m0]
|
||||
|
||||
if(USE_MTBUF_INSTEAD_OF_MUBUF)
|
||||
tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
|
||||
else
|
||||
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
|
||||
end
|
||||
|
||||
s_add_u32 m0, m0, 1 //next vgpr index
|
||||
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //every buffer_store_dword does 256 bytes
|
||||
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
|
||||
s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
|
||||
s_set_gpr_idx_off
|
||||
|
||||
/* save SGPRs */
|
||||
//////////////////////////////
|
||||
s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size
|
||||
s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
|
||||
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
|
||||
|
||||
if (SGPR_SAVE_USE_SQC)
|
||||
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
|
||||
else
|
||||
s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
|
||||
end
|
||||
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
|
||||
s_mov_b32 m0, 0x0 //SGPR initial index value =0
|
||||
s_nop 0x0 //Manually inserted wait states
|
||||
|
||||
L_SAVE_SGPR_LOOP:
|
||||
s_movrels_b32 s0, s0 //s0 = s[0+m0]
|
||||
write_sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4
|
||||
s_add_u32 m0, m0, 1 //next sgpr index
|
||||
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
|
||||
s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
|
||||
|
||||
/* save HW registers */
|
||||
//////////////////////////////
|
||||
L_SAVE_HWREG:
|
||||
s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
|
||||
|
||||
write_sgpr_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0
|
||||
|
||||
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
|
||||
s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
|
||||
s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
|
||||
s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO
|
||||
s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI
|
||||
end
|
||||
|
||||
write_sgpr_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC
|
||||
write_sgpr_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
|
||||
write_sgpr_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC
|
||||
write_sgpr_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
|
||||
write_sgpr_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS
|
||||
|
||||
//s_save_trapsts conflicts with s_save_alloc_size
|
||||
s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
|
||||
write_sgpr_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS
|
||||
|
||||
write_sgpr_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO
|
||||
write_sgpr_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_HI
|
||||
|
||||
//use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
|
||||
s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
|
||||
write_sgpr_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
|
||||
|
||||
write_sgpr_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TBA_LO
|
||||
write_sgpr_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TBA_HI
|
||||
|
||||
/* S_PGM_END_SAVED */ //FIXME graphics ONLY
|
||||
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
|
||||
s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
|
||||
s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
|
||||
s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
|
||||
s_rfe_b64 s_save_pc_lo //Return to the main shader program
|
||||
else
|
||||
end
|
||||
|
||||
|
||||
s_branch L_END_PGM
|
||||
|
||||
|
||||
|
||||
/**************************************************************************/
|
||||
/* restore routine */
|
||||
/**************************************************************************/
|
||||
|
||||
L_RESTORE:
|
||||
/* Setup Resource Constants */
|
||||
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
|
||||
//calculate wd_addr using absolute thread id
|
||||
v_readlane_b32 s_restore_tmp, v9, 0
|
||||
s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
|
||||
s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
|
||||
s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
|
||||
s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
|
||||
s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
|
||||
else
|
||||
end
|
||||
|
||||
s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
|
||||
s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
|
||||
s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
|
||||
s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
|
||||
s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
|
||||
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
|
||||
s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
|
||||
s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
|
||||
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
|
||||
s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
|
||||
s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
|
||||
|
||||
/* global mem offset */
|
||||
s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
|
||||
|
||||
/* the first wave in the threadgroup */
|
||||
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
|
||||
s_cbranch_scc0 L_RESTORE_VGPR
|
||||
|
||||
/* restore LDS */
|
||||
//////////////////////////////
|
||||
L_RESTORE_LDS:
|
||||
|
||||
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
|
||||
s_mov_b32 exec_hi, 0xFFFFFFFF
|
||||
|
||||
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
|
||||
s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
|
||||
s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
|
||||
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
|
||||
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
|
||||
s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
s_mov_b32 m0, 0x0 //lds_offset initial value = 0
|
||||
|
||||
L_RESTORE_LDS_LOOP:
|
||||
if (SAVE_LDS)
|
||||
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
|
||||
end
|
||||
s_add_u32 m0, m0, 256 //every buffer_load_dword does 256 bytes
|
||||
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes
|
||||
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
|
||||
s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?
|
||||
|
||||
|
||||
/* restore VGPRs */
|
||||
//////////////////////////////
|
||||
L_RESTORE_VGPR:
|
||||
|
||||
s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
|
||||
s_mov_b32 exec_hi, 0xFFFFFFFF
|
||||
|
||||
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
|
||||
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
|
||||
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
|
||||
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
|
||||
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256
|
||||
s_mov_b32 m0, 1 //VGPR initial index value = 1
|
||||
s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
|
||||
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later
|
||||
|
||||
L_RESTORE_VGPR_LOOP:
|
||||
if(USE_MTBUF_INSTEAD_OF_MUBUF)
|
||||
tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
|
||||
else
|
||||
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
|
||||
end
|
||||
s_waitcnt vmcnt(0) //ensure data ready
|
||||
v_mov_b32 v0, v0 //v[0+m0] = v0
|
||||
s_add_u32 m0, m0, 1 //next vgpr index
|
||||
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //every buffer_load_dword does 256 bytes
|
||||
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
|
||||
s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete?
|
||||
s_set_gpr_idx_off
|
||||
/* VGPR restore on v0 */
|
||||
if(USE_MTBUF_INSTEAD_OF_MUBUF)
|
||||
tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
|
||||
else
|
||||
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
|
||||
end
|
||||
|
||||
|
||||
/* restore SGPRs */
|
||||
//////////////////////////////
|
||||
s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
|
||||
s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
|
||||
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
|
||||
|
||||
if (SGPR_SAVE_USE_SQC)
|
||||
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
|
||||
else
|
||||
s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
|
||||
end
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
read_sgpr_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp
|
||||
s_mov_b32 m0, 0x1 //SGPR initial index value =1 //go on with with s1
|
||||
|
||||
L_RESTORE_SGPR_LOOP:
|
||||
read_sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made
|
||||
s_waitcnt lgkmcnt(0) //ensure data ready
|
||||
s_movreld_b32 s0, s0 //s[0+m0] = s0
|
||||
s_add_u32 m0, m0, 1 //next sgpr index
|
||||
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
|
||||
s_cbranch_scc1 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete?
|
||||
s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */
|
||||
|
||||
/* restore HW registers */
|
||||
//////////////////////////////
|
||||
L_RESTORE_HWREG:
|
||||
s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
|
||||
if (SWIZZLE_EN)
|
||||
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
|
||||
else
|
||||
s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
|
||||
end
|
||||
|
||||
read_sgpr_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0
|
||||
read_sgpr_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC
|
||||
read_sgpr_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
|
||||
read_sgpr_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC
|
||||
read_sgpr_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
|
||||
read_sgpr_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS
|
||||
read_sgpr_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS
|
||||
read_sgpr_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO
|
||||
read_sgpr_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI
|
||||
read_sgpr_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE
|
||||
read_sgpr_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TBA_LO
|
||||
read_sgpr_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TBA_HI
|
||||
|
||||
s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
|
||||
|
||||
s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
|
||||
|
||||
//for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
|
||||
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
|
||||
s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
|
||||
s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
|
||||
end
|
||||
if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
|
||||
s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal
|
||||
s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
|
||||
end
|
||||
|
||||
s_mov_b32 m0, s_restore_m0
|
||||
s_mov_b32 exec_lo, s_restore_exec_lo
|
||||
s_mov_b32 exec_hi, s_restore_exec_hi
|
||||
|
||||
s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
|
||||
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
|
||||
s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
|
||||
s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
|
||||
s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
|
||||
//s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
|
||||
s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
|
||||
//reuse s_restore_m0 as a temp register
|
||||
s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
|
||||
s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
|
||||
s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
|
||||
s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
|
||||
s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
|
||||
s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
|
||||
s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
|
||||
s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
|
||||
s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
|
||||
s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
|
||||
s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
|
||||
s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
|
||||
s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status
|
||||
|
||||
s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time
|
||||
|
||||
|
||||
// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
|
||||
s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
|
||||
|
||||
|
||||
/**************************************************************************/
|
||||
/* the END */
|
||||
/**************************************************************************/
|
||||
L_END_PGM:
|
||||
s_endpgm
|
||||
|
||||
end
|
||||
|
||||
|
||||
/**************************************************************************/
|
||||
/* the helper functions */
|
||||
/**************************************************************************/
|
||||
|
||||
// Store the scalar value `s` into the buffer described by `s_rsrc` at byte
// offset `s_mem_offset`, then advance `s_mem_offset` past the slot just written.
//   use_sqc   : non-zero -> scalar store (s_buffer_store_dword), offset advances by 4
//   use_mtbuf : only consulted when use_sqc is zero; selects tbuffer_store_format_x
//               over buffer_store_dword. Both vector paths advance the offset by 256.
// Side effects: the scalar path overwrites exec_lo (used to park m0, which is
// restored afterwards); the vector paths overwrite v0.
function write_sgpr_to_mem(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
    if (use_sqc)
        s_mov_b32 exec_lo, m0                           //park m0; assuming exec_lo is not needed anymore from this point on
        s_mov_b32 m0, s_mem_offset                      //the scalar store takes its offset from m0
        s_buffer_store_dword s, s_rsrc, m0 glc:1
        s_add_u32 s_mem_offset, s_mem_offset, 4
        s_mov_b32 m0, exec_lo                           //hand m0 back to the caller
    else
        v_mov_b32 v0, s                                 //vector stores need the data in a VGPR
        if (use_mtbuf)
            tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
        else
            buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
        end
        s_add_u32 s_mem_offset, s_mem_offset, 256
    end
end
|
||||
|
||||
|
||||
|
||||
// Issue a scalar load of one dword from buffer `s_rsrc` at byte offset
// `s_mem_offset` into `s`, then advance `s_mem_offset` by one slot:
// 4 bytes when use_sqc is non-zero, 256 bytes otherwise.
// No s_waitcnt is issued here; the loaded value must be waited for by the caller.
function read_sgpr_from_mem(s, s_rsrc, s_mem_offset, use_sqc)
    s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
    if (!use_sqc)
        s_add_u32 s_mem_offset, s_mem_offset, 256       //slot size used by the vector save paths
    else
        s_add_u32 s_mem_offset, s_mem_offset, 4         //slot size used by the scalar save path
    end
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
// Busy-loop compute shader: loads a bound from memory, counts v5 upwards until
// no lane is below the bound any more, then stores the final count.
//   s[0:1] -> mirrored into v[0:1], the flat address the bound (v4) is loaded from
//   s[2:3] -> mirrored into v[2:3], the flat address the counter (v5) is stored to
shader main
    type(CS)

    user_sgpr_count(4)

    // flat_* instructions take their address from VGPRs, so copy the user SGPRs over
    v_mov_b32 v0, s0
    v_mov_b32 v1, s1
    v_mov_b32 v2, s2
    v_mov_b32 v3, s3

    flat_load_dword v4, v[0:1] slc          // v4 = loop bound
    s_waitcnt vmcnt(0)&lgkmcnt(0)           // the bound must have arrived before it is compared
    v_mov_b32 v5, 0                         // v5 = counter
    s_sleep 40000                           // NOTE(review): confirm how many bits of this immediate the target honours

LOOP:
    v_add_co_u32 v5, vcc, 1, v5             // counter += 1 (body runs at least once)
    s_waitcnt vmcnt(0)&lgkmcnt(0)
    v_cmp_lt_u32 vcc, v5, v4                // vcc = lanes whose counter is still below the bound
    s_cbranch_vccnz LOOP                    // keep going while any lane is below its bound

    // Address pair written in range form, matching the v[0:1] load above
    // (the original spelled it v[2,3]).
    flat_store_dword v[2:3], v5
    s_waitcnt vmcnt(0)&lgkmcnt(0)
    s_endpgm
end
|
||||
@@ -0,0 +1,69 @@
|
||||
// Buffer store/load stress shader. Every thread issues 2*s8 dword stores of v0
// through the buffer resource, then 2*s8 dword loads from the same offsets.
// The scalar offset s31 starts at 0x80 and advances by 0x100 per access.
// NOTE(review): the loops are do-while style, so s8 == 0 on entry wraps around
// instead of skipping the loop - confirm the dispatcher never passes 0.
shader main
    type(CS)

    user_sgpr_count(9)              // s[0:1] buffer resource address + s2..s8 parameters

    //s[0:1] the memory address for the buffer resource
    //s2     x
    //s3     x*y
    //s4     x*y*z
    //s5     X
    //s6     X*Y
    //s7     output offset (not read by this shader)
    //s8     loop count

    tgid_x_en(1)                    //s_tgid_x s9
    tgid_y_en(1)                    //s_tgid_y s10
    tgid_z_en(1)                    //s_tgid_z s11

    //v0 for tid_x
    //v1 for tid_y
    //v2 for tid_z

    //fetch the buffer resource through SQC and wait until it has landed in s[24:27]
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    //s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_add_i32 s28, s29, s28

    //v3 thread_id_in_group = (tid_z*x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    //v9 absolute thread id = thread_group_id*(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    //store pass: two stores per iteration, s8 iterations
    s_mov_b32 s30, s8               //remember the loop count for the load pass
    s_mov_b32 s31, 0x80             //starting scalar offset

STORE_LOOP:
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x100
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x100
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 STORE_LOOP       //loop until the counter reaches zero

    //load pass: rewind the offset and the loop count, drain the stores first
    s_mov_b32 s31, 0x80
    s_mov_b32 s8, s30

    s_waitcnt 0
LOAD_LOOP:
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x100
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x100
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 LOAD_LOOP

    s_endpgm
end
|
||||
|
||||
@@ -0,0 +1,131 @@
|
||||
// Unrolled buffer store/load shader. Every thread issues 20 dword stores of v0
// at scalar offsets 0x00..0x4C (step 4) and then 20 dword loads from the same
// offsets.
// Fix relative to the original: the scalar offset s31 is rewound to 0 before the
// load pass. The original left it at 0x50, so the loads never touched the
// locations that had just been written; the looped variant of this shader
// rewinds its offset between passes.
shader main
    type(CS)

    user_sgpr_count(9)              // s[0:1] buffer resource address + s2..s8 parameters

    //s[0:1] the memory address for the buffer resource
    //s2     x
    //s3     x*y
    //s4     x*y*z
    //s5     X
    //s6     X*Y
    //s7     output offset (not read by this shader)
    //s8     loop (only copied to s30 here; the passes are unrolled)

    tgid_x_en(1)                    //s_tgid_x s9
    tgid_y_en(1)                    //s_tgid_y s10
    tgid_z_en(1)                    //s_tgid_z s11

    //v0 for tid_x
    //v1 for tid_y
    //v2 for tid_z

    //fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    //v3 thread_id_in_group = (tid_z*x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    //s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    //v9 absolute thread id = thread_group_id*(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    s_mov_b32 s30, s8
    s_mov_b32 s31, 0x0              //store pass starts at offset 0

    //store pass: 20 stores, 4 bytes apart
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4

    s_waitcnt 0                     //drain the stores before reading back

    s_mov_b32 s31, 0x0              //rewind so the loads revisit the offsets written above

    //load pass: 20 loads, 4 bytes apart
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
    s_add_u32 s31, s31, 0x4

    s_endpgm
end
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
// Strided buffer store shader. Every thread stores v0 s8 times using a
// per-thread VGPR byte offset (absolute thread id * 0x4000, offen addressing)
// plus a scalar offset that starts at 0x10000 and grows by 0x10000 per store.
// When the loop is done, the marker 0xa5a50000 is written with a scalar store
// at offset 0x40 from the address in s[0:1].
shader main
    type(CS)

    user_sgpr_count(9)              // s[0:1] buffer resource address + s2..s8 parameters

    //s[0:1] the memory address for the buffer resource
    //s2     x
    //s3     x*y
    //s4     x*y*z
    //s5     X
    //s6     X*Y
    //s7     output offset (not read by this shader)
    //s8     loop count

    tgid_x_en(1)                    //s_tgid_x s9
    tgid_y_en(1)                    //s_tgid_y s10
    tgid_z_en(1)                    //s_tgid_z s11

    //v0 for tid_x
    //v1 for tid_y
    //v2 for tid_z

    //fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    //s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_add_i32 s28, s29, s28

    s_mov_b32 s32, 0x4000           //per-thread byte stride

    //v3 thread_id_in_group = (tid_z*x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    //v9 absolute thread id = thread_group_id*(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    //v9 becomes the per-thread byte offset
    v_mul_i32_i24 v9, v9, s32

    //store s8 times (this shader has no load pass)
    s_mov_b32 s30, s8
    s_mov_b32 s31, 0x10000

STORE_LOOP:
    buffer_store_dword v0, v9, s24, s31 offen:1
    s_waitcnt 0                     //one store in flight at a time
    s_add_u32 s31, s31, 0x10000
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 STORE_LOOP

    //completion marker
    s_mov_b32 s16, 0xa5a50000
    s_store_dword s16, s[0:1], 0x40 glc

    s_endpgm
end
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
// GDS write/read shader. Every thread loads one dword from the buffer, writes
// it to GDS s8 times (address advancing by 0x10 per write), reads GDS back s8
// times, and finally writes the marker 0xa5a50000 with a scalar store at
// offset 0x40 from the address in s[0:1].
// NOTE(review): the write pass derives its GDS address from v9 (absolute thread
// id) while the read pass derives it from v3 (thread id in group); the two only
// coincide for thread group 0 - confirm this is intended.
shader main
    type(CS)

    user_sgpr_count(9)              // s[0:1] buffer resource address + s2..s8 parameters

    //s[0:1] the memory address for the buffer resource
    //s2     x
    //s3     x*y
    //s4     x*y*z
    //s5     X
    //s6     X*Y
    //s7     output offset (not read by this shader)
    //s8     loop count

    tgid_x_en(1)                    //s_tgid_x s9
    tgid_y_en(1)                    //s_tgid_y s10
    tgid_z_en(1)                    //s_tgid_z s11

    //v0 for tid_x
    //v1 for tid_y
    //v2 for tid_z

    //fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    //s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_add_i32 s28, s29, s28

    //v3 thread_id_in_group = (tid_z*x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    //v9 absolute thread id = thread_group_id*(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    //read the payload from memory into v0
    s_mov_b32 s31, 0x0
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
    s_waitcnt 0

    //write it to GDS
    s_mov_b32 s30, s8               //remember the loop count for the read pass
    v_lshlrev_b32 v10, 2, v9        //byte address = absolute thread id * 4
    s_mov_b32 m0, 0xFFFF
    s_nop 0x1
    s_nop 0x1
    s_nop 0x1

STORE_LOOP:
    ds_write_b32 v10, v0 gds:1      //original note: GPU hangs when the GDS is accessed from a GFX queue
    s_waitcnt 0
    v_add_u16 v10, v10, 0x10
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 STORE_LOOP

    //read it back from GDS
    s_mov_b32 s8, s30
    v_lshlrev_b32 v10, 2, v3        //byte address = thread id in group * 4

LOAD_LOOP:
    ds_read_b32 v11, v10 gds:1
    s_waitcnt 0
    v_mov_b32 v12, v11
    v_add_u16 v10, v10, 0x10
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 LOAD_LOOP

    //completion marker
    s_mov_b32 s16, 0xa5a50000
    s_store_dword s16, s[0:1], 0x40 glc

    s_endpgm
end
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
// GDS read-out shader. Every thread reads one dword from GDS at byte address
// thread_id_in_group*4 and stores it to the buffer, indexed by the absolute
// thread id, at scalar offset s7.
shader main
    type(CS)

    user_sgpr_count(8)              // s[0:1] buffer resource address + s2..s7 parameters

    //s[0:1] the memory address for the buffer resource
    //s2     x
    //s3     x*y
    //s4     x*y*z
    //s5     X
    //s6     X*Y
    //s7     output offset

    tgid_x_en(1)                    //s_tgid_x s8
    tgid_y_en(1)                    //s_tgid_y s9
    tgid_z_en(1)                    //s_tgid_z s10

    //v0 for tid_x
    //v1 for tid_y
    //v2 for tid_z

    //fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    //s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_add_i32 s28, s29, s28

    //v3 thread_id_in_group = (tid_z*x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    //v9 absolute thread id = thread_group_id*(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    //read data from GDS
    v_lshlrev_b32 v10, 2, v3        //byte address = thread id in group * 4
    s_mov_b32 m0, 0xFFFF
    s_nop 1
    s_nop 1
    s_nop 1
    ds_read_b32 v11, v10 gds:1
    s_waitcnt 0

    //write the data to memory
    buffer_store_dword v11, v9, s24, s7 idxen:1 glc:1
    s_waitcnt 0

    s_endpgm
end
|
||||
|
||||
@@ -0,0 +1,68 @@
|
||||
// Ordered-append GDS shader. Each wave takes a ticket with ds_ordered_count and
// then writes v0 to GDS at dword index
//     0x80 + 0x200*pipeid + 0x40*ticket + lane_id
// NOTE(review): s1 and s12 are read but never written in this shader; they are
// presumably preloaded by the dispatcher - confirm against the host code.
shader main
    type(CS)

    user_sgpr_count(4)
    tgid_x_en(1)
    tgid_y_en(1)
    tgid_z_en(1)

    s_getreg_b32 s18, hwreg(HW_REG_HW_ID, 0, 32)
    s_bfe_u32 s16, s18, 0x2001e     // get meid (2 bits at bit 30)
    s_bfe_u32 s17, s18, 0x20006     // get pipeid (2 bits at bit 6)
    // disabled in the original: s_add_u32 s17, s17, s16

    // get ring id
    v_mov_b32 v20, s17
    s_and_b32 s17, s17, 0x7

    // Get thread_id inside wave -> v9
    v_mbcnt_lo_u32_b32 v8, 0xffffffff, 0
    v_mbcnt_hi_u32_b32 v9, 0xffffffff, v8

    s_waitcnt 0

    // init: gds write address
    v_mov_b32 v13, 0

    // the first 128DW is for ordered-append counter
    v_mov_b32 v14, 0x80

    v_mov_b32 v16, 0x40             // wave_size

    // offset ring
    v_mov_b32 v15, 0x200
    v_mul_lo_u32 v15, v15, v20      // ring offset

    // v18/v17 computed here are not consumed: v18 is overwritten with the
    // ticket below and v17 is never read again in this shader.
    v_mul_lo_u32 v18, v1, s1
    v_add_co_u32 v18, vcc, v18, v0
    v_lshrrev_b32 v17,6 ,v18

    // m0 = ((s12 >> 6) & 0x7ff) | ((pipeid & 0x7) << 18)
    s_mov_b32 s9, s12
    s_lshr_b32 s9, s9, 6
    s_and_b32 s9, s9, 0x7ff
    s_lshl_b32 s17, s17, 18
    s_or_b32 s9, s9, s17
    s_mov_b32 m0, s9

    // take a ticket: v11 receives the ordered count
    v_mov_b32 v10, 1
    v_mov_b32 v11, 0
    ds_ordered_count v11, v10 gds:1 offset0:0 offset1:1
    s_waitcnt 0

    v_mov_b32 v18, v11

    v_mul_lo_u32 v16, v16, v18      // waves offset before.
    v_add_co_u32 v13, vcc, v13, v14
    v_add_co_u32 v13, vcc, v13, v15
    v_add_co_u32 v13, vcc, v13, v16
    v_add_co_u32 v13, vcc, v13, v9

    v_lshlrev_b32 v13,2,v13         // dword index -> byte address
    s_mov_b32 m0, 0x4000
    s_nop 0
    ds_write_b32 v13, v0 gds:1
    s_waitcnt 0

    s_endpgm
end
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
// LDS write/read shader. Every thread loads one dword from the buffer, writes
// it to LDS s8 times starting at byte address thread_id_in_group*4 (advancing
// by 0x10 per write), reads LDS back s8 times from the same addresses, and
// finally writes the marker 0xa5a50000 with a scalar store at offset 0x40 from
// the address in s[0:1].
shader main
    type(CS)

    user_sgpr_count(9)              // s[0:1] buffer resource address + s2..s8 parameters

    //s[0:1] the memory address for the buffer resource
    //s2     x
    //s3     x*y
    //s4     x*y*z
    //s5     X
    //s6     X*Y
    //s7     output offset (not read by this shader)
    //s8     loop count

    tgid_x_en(1)                    //s_tgid_x s9
    tgid_y_en(1)                    //s_tgid_y s10
    tgid_z_en(1)                    //s_tgid_z s11

    //v0 for tid_x
    //v1 for tid_y
    //v2 for tid_z

    //fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    //s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_add_i32 s28, s29, s28

    //v3 thread_id_in_group = (tid_z*x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    //v9 absolute thread id = thread_group_id*(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    //read the payload from memory into v0
    s_mov_b32 s31, 0x0
    buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
    s_waitcnt 0

    //store and load s8 times
    s_mov_b32 s30, s8               //remember the loop count for the read pass
    v_lshlrev_b32 v10, 2, v3        //byte address = thread id in group * 4
    s_mov_b32 m0, 0xFFFF
    s_nop 0x1
    s_nop 0x1
    s_nop 0x1

STORE_LOOP:
    ds_write_b32 v10, v0
    s_waitcnt 0
    v_add_u16 v10, v10, 0x10
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 STORE_LOOP

    //rewind the loop count and the LDS address for the read pass
    s_mov_b32 s8, s30
    v_lshlrev_b32 v10, 2, v3

LOAD_LOOP:
    ds_read_b32 v11, v10
    s_waitcnt 0
    v_mov_b32 v12, v11
    v_add_u16 v10, v10, 0x10
    s_sub_u32 s8, s8, 1
    s_cmpk_eq_u32 s8, 0
    s_cbranch_scc0 LOAD_LOOP

    //completion marker
    s_mov_b32 s16, 0xa5a50000
    s_store_dword s16, s[0:1], 0x40 glc

    s_endpgm
end
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
// LDS read-out shader. Every thread reads one dword from LDS at byte address
// thread_id_in_group*4 and stores it to the buffer, indexed by the absolute
// thread id, at scalar offset s7.
shader main
    type(CS)

    user_sgpr_count(8)              // s[0:1] buffer resource address + s2..s7 parameters

    //s[0:1] the memory address for the buffer resource
    //s2     x
    //s3     x*y
    //s4     x*y*z
    //s5     X
    //s6     X*Y
    //s7     output offset

    tgid_x_en(1)                    //s_tgid_x s8
    tgid_y_en(1)                    //s_tgid_y s9
    tgid_z_en(1)                    //s_tgid_z s10

    //v0 for tid_x
    //v1 for tid_y
    //v2 for tid_z

    //fetch the buffer resource through SQC
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    //s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_add_i32 s28, s29, s28

    //v3 thread_id_in_group = (tid_z*x*y) + (tid_y*x) + tid_x
    v_mad_u32_u24 v3, v1, s2, v0
    v_mad_u32_u24 v3, v2, s3, v3

    //v9 absolute thread id = thread_group_id*(x*y*z) + thread_id_in_group
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    //read it from LDS
    v_lshlrev_b32 v10, 2, v3        //byte address = thread id in group * 4
    s_mov_b32 m0, 0xFFFF
    s_nop 1
    s_nop 1
    s_nop 1
    ds_read_b32 v0, v10
    s_waitcnt 0

    //write the data to memory
    buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
    s_waitcnt 0

    s_endpgm
end
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
// Buffer load/store shader: each thread loads one dword from the buffer at
// index v9 (absolute thread id, scalar offset 0) and stores it back at the
// same index with scalar offset s7.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(8) // 2 for the buffer resource address (s[0:1]) + 6 for the parameters in s2..s7
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s8
|
||||
tgid_y_en(1) //s_tgid_y s9
|
||||
tgid_z_en(1) //s_tgid_z s10
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource (4 dwords) into s[24:27] through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//read mem data (scalar offset s31 = 0)
|
||||
s_mov_b32 s31, 0x0
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
|
||||
//export poisoned data to L2
|
||||
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
// Load / VGPR-copy / store shader: each thread loads one dword from the
// buffer, copies it through v10..v19 and then v29..v20, and stores the
// originally loaded v0 back to the buffer with scalar offset s7.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(8) // 2 for the buffer resource address (s[0:1]) + 6 for the parameters in s2..s7
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s8
|
||||
tgid_y_en(1) //s_tgid_y s9
|
||||
tgid_z_en(1) //s_tgid_z s10
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource (4 dwords) into s[24:27] through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//read mem data (scalar offset s31 = 0)
|
||||
s_mov_b32 s31, 0x0
|
||||
//For vega20, we need to set bit 12 low. This bit will just be set low here in the shader.
// NOTE(review): the override on the next line is commented out, so s24 is used exactly as loaded above.
|
||||
//s_mov_b32 s24, 0x15c000
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
//store it 10 times: copy the loaded value into v10..v19
|
||||
v_mov_b32 v10, v0
|
||||
v_mov_b32 v11, v0
|
||||
v_mov_b32 v12, v0
|
||||
v_mov_b32 v13, v0
|
||||
v_mov_b32 v14, v0
|
||||
v_mov_b32 v15, v0
|
||||
v_mov_b32 v16, v0
|
||||
v_mov_b32 v17, v0
|
||||
v_mov_b32 v18, v0
|
||||
v_mov_b32 v19, v0
|
||||
|
||||
// read them back: copy v10..v19 into v29..v20
|
||||
v_mov_b32 v29, v10
|
||||
v_mov_b32 v28, v11
|
||||
v_mov_b32 v27, v12
|
||||
v_mov_b32 v26, v13
|
||||
v_mov_b32 v25, v14
|
||||
v_mov_b32 v24, v15
|
||||
v_mov_b32 v23, v16
|
||||
v_mov_b32 v22, v17
|
||||
v_mov_b32 v21, v18
|
||||
v_mov_b32 v20, v19
|
||||
|
||||
//export poisoned data to L2
|
||||
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,51 @@
|
||||
// Buffer load/store shader: each thread loads one dword from the buffer at
// index v9 (absolute thread id, scalar offset 0) and stores it back at the
// same index with scalar offset s7.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(8) // 2 for the buffer resource address (s[0:1]) + 6 for the parameters in s2..s7
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s8
|
||||
tgid_y_en(1) //s_tgid_y s9
|
||||
tgid_z_en(1) //s_tgid_z s10
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource (4 dwords) into s[24:27] through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//read mem data (scalar offset s31 = 0)
|
||||
s_mov_b32 s31, 0x0
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
//export poisoned data to L2
|
||||
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
// SGPR copy shader: saves s2 in s16, copies s0..s9 into s30..s39 and back
// again, then stores s16 to the address held in s[0:1] (offset 0x0).
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2..s8
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//SPI may touch s0...sn before shader is run
|
||||
|
||||
// keep a copy of s2; s16 is the value written out at the end
s_mov_b32 s16, s2
|
||||
|
||||
//write data: copy s0..s9 into s30..s39
|
||||
s_mov_b32 s30, s0
|
||||
s_mov_b32 s31, s1
|
||||
s_mov_b32 s32, s2
|
||||
s_mov_b32 s33, s3
|
||||
s_mov_b32 s34, s4
|
||||
s_mov_b32 s35, s5
|
||||
s_mov_b32 s36, s6
|
||||
s_mov_b32 s37, s7
|
||||
s_mov_b32 s38, s8
|
||||
s_mov_b32 s39, s9
|
||||
|
||||
//read back: copy s30..s39 into s0..s9
|
||||
s_mov_b32 s0, s30
|
||||
s_mov_b32 s1, s31
|
||||
s_mov_b32 s2, s32
|
||||
s_mov_b32 s3, s33
|
||||
s_mov_b32 s4, s34
|
||||
s_mov_b32 s5, s35
|
||||
s_mov_b32 s6, s36
|
||||
s_mov_b32 s7, s37
|
||||
s_mov_b32 s8, s38
|
||||
s_mov_b32 s9, s39
|
||||
|
||||
s_store_dword s16, s[0:1], 0x0 glc
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,75 @@
|
||||
// Scalar buffer store shader: stores s8 through descriptor s[20:23] at offset
// m0, advancing m0 by 4*1024 per iteration until the s8 loop counter reaches
// zero. The read-back loop is only assembled when DEBUG_FUNCTION is non-zero.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2..s8
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resources through SQC: s[24:27] from offset 0, s[20:23] from offset 16
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
|
||||
// NOTE(review): MTYPE_UC is OR-ed into s27 (last dword of s[24:27]), but the loops below
// access memory through s[20:23] - confirm which descriptor is meant to be modified.
var MTYPE_UC = 0x38000000
|
||||
s_or_b32 s27, s27, MTYPE_UC
|
||||
|
||||
// s30 keeps the original loop count so the optional LOAD_LOOP can restore s8
s_mov_b32 s30, s8
|
||||
s_mov_b32 m0, 0x0
|
||||
|
||||
|
||||
STORE_LOOP:
|
||||
s_buffer_store_dword s8, s[20:23], m0 glc:1
|
||||
s_waitcnt 0
|
||||
s_add_u32 m0, m0, 4*1024 // step one 4KB page table address
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
var DEBUG_FUNCTION = 0
|
||||
// The function-check (read-back) code is disabled to halve the shader run time...
|
||||
if DEBUG_FUNCTION
|
||||
s_mov_b32 s8, s30
|
||||
s_mov_b32 m0, 0x0
|
||||
|
||||
LOAD_LOOP:
|
||||
s_buffer_load_dword s0, s[20:23], m0 glc:1
|
||||
s_waitcnt 0
|
||||
s_add_u32 m0, m0, 4*1024
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
end
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,96 @@
|
||||
// Scalar store/load shader (glc:1): on every loop iteration stores s8 and
// loads it back into s32 through s[24:27] at offsets 0x0 and 0x40. The loop
// runs until the s8 counter reaches zero; bank C/D accesses are commented out.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2..s8
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource (4 dwords) into s[24:27] through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
/*
|
||||
s_bfe_u32 s33, s8, 0x20004 // extract bank select bits
|
||||
s_lshl_b32 s33, s33, 6 // ((bank_sel & 0x3) << 6) , bank_sel = address[9:8] ^ address[7:6], if 4 bank enabled
|
||||
s_and_b32 s8, s8, 0xf
|
||||
*/
|
||||
|
||||
//store and load s8 times
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
|
||||
s_or_b32 s26, s26, 0x1000 //hack the buffer size to be large enough
|
||||
|
||||
STORE_LOOP:
|
||||
|
||||
// Only the bank A (0x0) and bank B (0x40) accesses are active; bank C/D are commented out below.
var TOUCH_4_BANKS=1
|
||||
if TOUCH_4_BANKS
|
||||
s_mov_b32 m0, 0x0 // BANKA
|
||||
s_buffer_store_dword s8, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
|
||||
s_mov_b32 m0, 0x40 // BANKB
|
||||
s_buffer_store_dword s8, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
/*
|
||||
s_mov_b32 m0, 0x80 // BANKC
|
||||
s_buffer_store_dword s8, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
|
||||
s_mov_b32 m0, 0xC0 // BANKD
|
||||
s_buffer_store_dword s8, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
*/
|
||||
|
||||
end
|
||||
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
s_dcache_wb // to make emu, sim img match...
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,96 @@
|
||||
// Scalar store/load shader (glc:0): on every loop iteration stores s8 and
// loads it back into s32 through s[24:27] at offsets 0x0 and 0x40. The loop
// runs until the s8 counter reaches zero; bank C/D accesses are commented out.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2..s8
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource (4 dwords) into s[24:27] through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
/*
|
||||
s_bfe_u32 s33, s8, 0x20004 // extract bank select bits
|
||||
s_lshl_b32 s33, s33, 6 // ((bank_sel & 0x3) << 6) , bank_sel = address[9:8] ^ address[7:6], if 4 bank enabled
|
||||
s_and_b32 s8, s8, 0xf
|
||||
*/
|
||||
|
||||
//store and load s8 times
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
|
||||
s_or_b32 s26, s26, 0x1000 //hack the buffer size to be large enough
|
||||
|
||||
STORE_LOOP:
|
||||
|
||||
// Only the bank A (0x0) and bank B (0x40) accesses are active; bank C/D are commented out below.
var TOUCH_4_BANKS=1
|
||||
if TOUCH_4_BANKS
|
||||
s_mov_b32 m0, 0x0 // BANKA
|
||||
s_buffer_store_dword s8, s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
|
||||
|
||||
s_mov_b32 m0, 0x40 // BANKB
|
||||
s_buffer_store_dword s8, s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
|
||||
/*
|
||||
s_mov_b32 m0, 0x80 // BANKC
|
||||
s_buffer_store_dword s8, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
|
||||
s_mov_b32 m0, 0xC0 // BANKD
|
||||
s_buffer_store_dword s8, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
*/
|
||||
|
||||
end
|
||||
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
s_dcache_wb // to make emu, sim img match...
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,112 @@
|
||||
// Scalar cache partial-line shader: per iteration and per bank (A, B) it does
// a glc:1 dwordx2 store at offset +0x10, a glc:0 dwordx2 store at offset +0x0
// and a glc:0 dword load at +0x10, then advances all offsets by 0x80. When the
// s8 counter reaches zero it writes 0xa5a50000 to s[0:1] + 0x40.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2..s8
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource (4 dwords) into s[24:27] through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
s_or_b32 s27, s27, 0x8000000 // changing mtype to non volatile
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
|
||||
//store and load s8 times
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
// data patterns for the stores below; this overwrites s9..s11, which held the
// thread group ids (already consumed in the s28 computation above)
s_mov_b32 s9, 0xaa
|
||||
s_mov_b32 s10, 0xbb
|
||||
s_mov_b32 s11, 0xcc
|
||||
|
||||
// BUFFER STORE OFFSETS FOR BANK A (s12, s13) AND BANK B (s14, s15)
|
||||
s_mov_b32 s12, 0x0
|
||||
s_mov_b32 s13, 0x10
|
||||
s_mov_b32 s14, 0x40
|
||||
s_mov_b32 s15, 0x50
|
||||
|
||||
|
||||
// The following sequence is needed to inject error in dirty bit ram. Sequence was provided by SQC designer 4/1/2015
|
||||
//1. you have an invalid line in data cache,
|
||||
//2. you write to some of the dwords in that line (the remaining dwords are still invalid),
|
||||
//3. then there is a read request that hit on that line, but it needs the dwords that are not yet there in that line
|
||||
//(in other words, it needs some of the invalid dwords of that line),
|
||||
//4. the request will go to TC,
|
||||
//5. when TC return comes back, the dirty bit ram will be read
|
||||
|
||||
STORE_LOOP:
|
||||
|
||||
var TOUCH_4_BANKS=1
|
||||
if TOUCH_4_BANKS
|
||||
|
||||
s_mov_b32 m0, s13 // BANKA write to tc (dwordx2 s[8:9], glc:1)
|
||||
s_buffer_store_dwordx2 s[8:9], s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
|
||||
s_mov_b32 m0, s12 // BANKA write to sqc (dwordx2 s[10:11], glc:0)
|
||||
s_buffer_store_dwordx2 s[10:11], s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
|
||||
s_mov_b32 m0, s13 // BANK A read the dword that is not in cache
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
|
||||
s_mov_b32 m0, s15 // BANKB write to tc (dwordx2 s[8:9], glc:1)
|
||||
s_buffer_store_dwordx2 s[8:9], s[24:27], m0 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
s_mov_b32 m0, s14 // BANKB write to sqc (dwordx2 s[10:11], glc:0)
|
||||
s_buffer_store_dwordx2 s[10:11], s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
|
||||
s_mov_b32 m0, s15 // BANK B read the dword that is not in cache
|
||||
s_buffer_load_dword s32, s[24:27], m0 glc:0
|
||||
s_waitcnt 0
|
||||
end
|
||||
|
||||
// advance all four offsets by 0x80 for the next iteration
s_add_u32 s12, s12,0x80
|
||||
s_add_u32 s13, s13,0x80
|
||||
s_add_u32 s14, s14,0x80
|
||||
s_add_u32 s15, s15,0x80
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
s_dcache_wb // to make emu, sim img match...
|
||||
|
||||
// write the marker 0xa5a50000 at offset 0x40 from s[0:1]
s_mov_b32 s16, 0xa5a50000
|
||||
s_store_dword s16, s[0:1], 0x40 glc
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
// Instruction-page shader: assembles 34 labelled blocks, each made of one
// s_cbranch_execnz to the next block's label followed by (4*1024)/4 - 1
// v_mov_b32 instructions, and ends at the label inst_page[34].
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2..s8
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resources through SQC: s[24:27] from offset 0, s[20:23] from offset 16
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
|
||||
var MTYPE_UC = 0x38000000
|
||||
s_or_b32 s27, s27, MTYPE_UC
|
||||
|
||||
label inst_page[34+1] // 34 4k pages
|
||||
|
||||
for var i =0; i < 34; i++
|
||||
inst_page[i]:
|
||||
//each block is 4 KB in size: 1 branch dword + ((4*1024)/4 - 1) v_mov dwords
|
||||
s_cbranch_execnz inst_page[i+1] //1 dword
|
||||
for var j = 0; j < (4*1024)/4 -1; j++
|
||||
v_mov_b32 v0, 0 // each with 1 dword
|
||||
end
|
||||
|
||||
end
|
||||
inst_page[34]:
|
||||
|
||||
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
// Instruction-block shader: assembles num_cache_lines - 1 labelled blocks of
// one s_branch (to the next label) plus 15 v_nop, then 15 more v_nop, the last
// label, and 81 s_nop before ending the program.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2..s8
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
// the loop count in s8 is ignored; the number of blocks is fixed at 8 loops
|
||||
// Total number of cache lines equals 2 (banks A, B) * 8
|
||||
|
||||
var num_cache_lines = 16
|
||||
label BLOCK_64B[num_cache_lines]
|
||||
|
||||
|
||||
for var loop = 0; loop < num_cache_lines - 1; loop++
|
||||
BLOCK_64B[loop]:
|
||||
s_branch BLOCK_64B[loop+1] // 1DW
|
||||
for var i = 0; i < 15; i++
|
||||
v_nop
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
// last block
// NOTE(review): these 15 v_nop are emitted before the BLOCK_64B[num_cache_lines-1] label, so
// (assuming 1 dword per v_nop) that label lands 15 dwords past the end of the previous
// 16-dword block - confirm this offset is intended.
|
||||
for var i = 0; i < 15; i++
|
||||
v_nop
|
||||
end
|
||||
//For uei 2 msb and lsb flipped
|
||||
// s_nop will become v_nop and it will be a legal instruction
|
||||
BLOCK_64B[num_cache_lines-1]:
|
||||
for var i = 0; i < 81; i++
|
||||
s_nop 0x1
|
||||
end
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
/** comment, four bank interleave
|
||||
Addr 0x90000000 => Bank A
|
||||
Addr 0x90000040 => Bank B
|
||||
Addr 0x90000080 => Bank C
|
||||
Addr 0x900000c0 => Bank D
|
||||
Addr 0x90000100 => Bank B
|
||||
Addr 0x90000140 => Bank A
|
||||
Addr 0x90000180 => Bank D
|
||||
Addr 0x900001c0 => Bank C
|
||||
Addr 0x90000200 => Bank C
|
||||
Addr 0x90000240 => Bank D
|
||||
Addr 0x90000280 => Bank A
|
||||
Addr 0x900002c0 => Bank B
|
||||
Addr 0x90000300 => Bank D
|
||||
Addr 0x90000340 => Bank C
|
||||
Addr 0x90000380 => Bank B
|
||||
|
||||
**/
|
||||
@@ -0,0 +1,29 @@
|
||||
// No-op shader: assembles 1000 "s_nop 0x1" instructions and then ends the
// program; none of the user SGPR or VGPR inputs are read.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2..s8
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
for var i = 0; i < 1000; i++
|
||||
s_nop 0x1
|
||||
end
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,51 @@
|
||||
// Memory copy shader: each thread loads one dword from the buffer at index v9
// (absolute thread id, scalar offset 0) and stores it back at the same index
// with scalar offset s7.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(8) // 2 for the buffer resource address (s[0:1]) + 6 for the parameters in s2..s7
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s8
|
||||
tgid_y_en(1) //s_tgid_y s9
|
||||
tgid_z_en(1) //s_tgid_z s10
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource (4 dwords) into s[24:27] through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//read from memory (scalar offset s31 = 0)
|
||||
s_mov_b32 s31, 0x0
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
//write the data to memory
|
||||
buffer_store_dword v0, v9, s24, s7 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
// Vector store/load loop shader: writes the marker 0xa5a50001 to s[0:1] + 0x40,
// then runs a buffer_store loop and a buffer_load loop (scalar offset s31
// stepping by 4, each until the s8 counter reaches zero), and finally writes
// the marker 0xa5a50000 to the same location.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2..s8
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
// write the start marker 0xa5a50001 at offset 0x40 from s[0:1]
s_mov_b32 s16, 0xa5a50001
|
||||
s_store_dword s16, s[0:1], 0x40 glc
|
||||
|
||||
//fetch the buffer resource (4 dwords) into s[24:27] through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//store and load s8 times; s30 keeps the original count so it can be restored for LOAD_LOOP
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
STORE_LOOP:
|
||||
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
// restore the loop counter and reset the scalar offset for the load pass
s_mov_b32 s8, s30
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
LOAD_LOOP:
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
v_mov_b32 v12, v0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
|
||||
// write the end marker 0xa5a50000 at offset 0x40 from s[0:1]
s_mov_b32 s16, 0xa5a50000
|
||||
s_store_dword s16, s[0:1], 0x40 glc
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
// Vector store/load loop shader: clears bit 12 of s24 after loading the
// buffer resource, then runs a buffer_store loop and a buffer_load loop
// (scalar offset s31 stepping by 4, each until the s8 counter reaches zero).
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 for the parameters in s2..s8
|
||||
|
||||
//s[0:1] the memory address of the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource (4 dwords) into s[24:27] through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
//For vega20, we need to set bit 12 low to steer traffic to ea0 (mask 0xFFFFEFFF clears bit 12 of s24)
|
||||
s_mov_b32 s32, 0xFFFFEFFF
|
||||
s_and_b32 s24, s24, s32
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//store and load s8 times; s30 keeps the original count so it can be restored for LOAD_LOOP
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
STORE_LOOP:
|
||||
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
// restore the loop counter and reset the scalar offset for the load pass
s_mov_b32 s8, s30
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
LOAD_LOOP:
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
v_mov_b32 v12, v0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,345 @@
|
||||
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource + 5 for thread/thread group parameters
|
||||
|
||||
//s[0:1] the mmeory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//vo for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
//set bit 12 low to select EA0
|
||||
s_mov_b32 s32, 0xFFFFEFFF
|
||||
s_and_b32 s24, s24, s32
|
||||
|
||||
s_and_b32 s31, s9, 0x1
|
||||
s_cmpk_eq_i32 s31, 0x1
|
||||
s_cbranch_scc1 ODD_WAVES
|
||||
|
||||
//set bit 12 high to select EA1
|
||||
s_mov_b32 s32, 0x1000
|
||||
s_or_b32 s24, s24, s32
|
||||
|
||||
ODD_WAVES:
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
v_mul_i32_i24 v3, v3, 0x10
|
||||
v_mul_i32_i24 v9, v9, 0x10
|
||||
|
||||
s_mov_b32 s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
|
||||
s_barrier
|
||||
|
||||
s_mov_b32 s31, 0xF0000
|
||||
s_mov_b32 s32, 0x6000
|
||||
|
||||
//store and load s8 times
|
||||
s_mov_b32 s30, s8
|
||||
|
||||
s_cmpk_lt_i32 s9, 0x2
|
||||
s_cbranch_scc1 ATOMIC_LOOP
|
||||
|
||||
LOAD_LOOP:
|
||||
|
||||
buffer_load_dwordx4 v4, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v8, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v12, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v16, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v20, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v24, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v28, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v32, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v36, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v40, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
|
||||
buffer_load_dwordx4 v4, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v8, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v12, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v16, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v20, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v24, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v28, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v32, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v36, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v40, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
|
||||
buffer_load_dwordx4 v4, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v8, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v12, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v16, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v20, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v24, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v28, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v32, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v36, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v40, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
|
||||
buffer_load_dwordx4 v4, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v8, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v12, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v16, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v20, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v24, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v28, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v32, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v36, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
buffer_load_dwordx4 v40, v3, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x4000
|
||||
|
||||
s_sub_u32 s30, s30, 1
|
||||
s_cmpk_eq_u32 s30, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
|
||||
|
||||
s_cmpk_ge_i32 s9, 0x2
|
||||
s_cbranch_scc1 END
|
||||
|
||||
ATOMIC_LOOP:
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
s_sub_u32 s30, s30, 1
|
||||
s_cmpk_eq_u32 s30, 0
|
||||
s_cbranch_scc0 ATOMIC_LOOP
|
||||
|
||||
//s_waitcnt 0
|
||||
|
||||
END:
|
||||
s_waitcnt 0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,509 @@
|
||||
shader main
|
||||
type(CS)
|
||||
// Purpose: every thread issues 40 dwordx4 stores through the buffer descriptor in
// s[24:27]; afterwards workgroups with s_tgid_x < 2 run ATOMIC_LOOP (buffer atomics)
// and all other workgroups run LOAD_LOOP (scalar atomics). Each loop repeats s8 times.
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 thread/thread-group parameters (s2..s8)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset (not read by this shader)
|
||||
//s8 loop count
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
//clear bit 12 of s24 (first dword of the s[24:27] descriptor) to select EA0
|
||||
s_mov_b32 s32, 0xFFFFEFFF
|
||||
s_and_b32 s24, s24, s32
|
||||
|
||||
//workgroups with an odd s_tgid_x (s9 & 1 == 1) branch to ODD_WAVES and keep bit 12 clear;
//workgroups with an even s_tgid_x fall through and set bit 12 below
s_and_b32 s31, s9, 0x1
|
||||
s_cmpk_eq_i32 s31, 0x1
|
||||
s_cbranch_scc1 ODD_WAVES
|
||||
|
||||
//set bit 12 high to select EA1
|
||||
s_mov_b32 s32, 0x1000
|
||||
s_or_b32 s24, s24, s32
|
||||
|
||||
//join point: reached by both the odd and the even workgroups
ODD_WAVES:
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//scale both ids by 0x10 (16); they are the offen VGPR operands below (v9 for the stores, v3 for the atomics)
v_mul_i32_i24 v3, v3, 0x10
|
||||
v_mul_i32_i24 v9, v9, 0x10
|
||||
|
||||
//unrolled store sequence: 40 stores, s31 starts at 0x9000 and advances by 0x6000 after each store
s_mov_b32 s31, 0x9000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
buffer_store_dwordx4 v0, v9, s24, s31 offen:1
|
||||
s_add_i32 s31, s31, 0x6000
|
||||
|
||||
s_barrier
|
||||
|
||||
s_mov_b32 s31, 0xF0000 // NOTE(review): s31 is not read again in this shader - confirm whether this is a leftover
|
||||
s_mov_b32 s32, 0x9000 // starting soffset for ATOMIC_LOOP
|
||||
|
||||
//repeat the selected loop below s8 times; s30 is the working down-counter
|
||||
s_mov_b32 s30, s8
|
||||
|
||||
//workgroups with s_tgid_x (s9) < 2 go straight to ATOMIC_LOOP; all others run LOAD_LOOP
s_cmpk_lt_i32 s9, 0x2
|
||||
s_cbranch_scc1 ATOMIC_LOOP
|
||||
|
||||
s_mov_b32 s20, 0x1 // s20 is the data operand of the s_atomic_add instructions below
|
||||
|
||||
//NOTE: despite the label name, this loop issues scalar atomic adds (s_atomic_add), not loads
LOAD_LOOP:
|
||||
|
||||
s_atomic_add s20, s0, 0x100000
|
||||
s_atomic_add s20, s0, 0x100010
|
||||
s_atomic_add s20, s0, 0x100020
|
||||
s_atomic_add s20, s0, 0x100030
|
||||
s_atomic_add s20, s0, 0x100040
|
||||
s_atomic_add s20, s0, 0x100050
|
||||
s_atomic_add s20, s0, 0x100060
|
||||
s_atomic_add s20, s0, 0x100070
|
||||
s_atomic_add s20, s0, 0x100080
|
||||
s_atomic_add s20, s0, 0x100090
|
||||
|
||||
s_atomic_add s20, s0, 0x100100
|
||||
s_atomic_add s20, s0, 0x100110
|
||||
s_atomic_add s20, s0, 0x100120
|
||||
s_atomic_add s20, s0, 0x100130
|
||||
s_atomic_add s20, s0, 0x100140
|
||||
s_atomic_add s20, s0, 0x100150
|
||||
s_atomic_add s20, s0, 0x100160
|
||||
s_atomic_add s20, s0, 0x100170
|
||||
s_atomic_add s20, s0, 0x100180
|
||||
s_atomic_add s20, s0, 0x100190
|
||||
|
||||
s_atomic_add s20, s0, 0x100200
|
||||
s_atomic_add s20, s0, 0x100210
|
||||
s_atomic_add s20, s0, 0x100220
|
||||
s_atomic_add s20, s0, 0x100230
|
||||
s_atomic_add s20, s0, 0x100240
|
||||
s_atomic_add s20, s0, 0x100250
|
||||
s_atomic_add s20, s0, 0x100260
|
||||
s_atomic_add s20, s0, 0x100270
|
||||
s_atomic_add s20, s0, 0x100280
|
||||
s_atomic_add s20, s0, 0x100290
|
||||
|
||||
s_atomic_add s20, s0, 0x100300
|
||||
s_atomic_add s20, s0, 0x100310
|
||||
s_atomic_add s20, s0, 0x100320
|
||||
s_atomic_add s20, s0, 0x100330
|
||||
s_atomic_add s20, s0, 0x100340
|
||||
s_atomic_add s20, s0, 0x100350
|
||||
s_atomic_add s20, s0, 0x100360
|
||||
s_atomic_add s20, s0, 0x100370
|
||||
s_atomic_add s20, s0, 0x100380
|
||||
s_atomic_add s20, s0, 0x100390
|
||||
|
||||
s_atomic_add s20, s0, 0x100400
|
||||
s_atomic_add s20, s0, 0x100404
|
||||
s_atomic_add s20, s0, 0x100408
|
||||
s_atomic_add s20, s0, 0x10040c
|
||||
s_atomic_add s20, s0, 0x100410
|
||||
s_atomic_add s20, s0, 0x100414
|
||||
s_atomic_add s20, s0, 0x100418
|
||||
s_atomic_add s20, s0, 0x10041c
|
||||
s_atomic_add s20, s0, 0x100420
|
||||
s_atomic_add s20, s0, 0x100424
|
||||
s_atomic_add s20, s0, 0x100428
|
||||
s_atomic_add s20, s0, 0x10042c
|
||||
|
||||
s_atomic_add s20, s0, 0x100500
|
||||
s_atomic_add s20, s0, 0x100504
|
||||
s_atomic_add s20, s0, 0x100508
|
||||
s_atomic_add s20, s0, 0x10050c
|
||||
s_atomic_add s20, s0, 0x100510
|
||||
s_atomic_add s20, s0, 0x100514
|
||||
s_atomic_add s20, s0, 0x100518
|
||||
s_atomic_add s20, s0, 0x10051c
|
||||
s_atomic_add s20, s0, 0x100520
|
||||
s_atomic_add s20, s0, 0x100524
|
||||
s_atomic_add s20, s0, 0x100528
|
||||
s_atomic_add s20, s0, 0x10052c
|
||||
|
||||
s_atomic_add s20, s0, 0x100600
|
||||
s_atomic_add s20, s0, 0x100604
|
||||
s_atomic_add s20, s0, 0x100608
|
||||
s_atomic_add s20, s0, 0x10060c
|
||||
s_atomic_add s20, s0, 0x100610
|
||||
s_atomic_add s20, s0, 0x100614
|
||||
s_atomic_add s20, s0, 0x100618
|
||||
s_atomic_add s20, s0, 0x10061c
|
||||
s_atomic_add s20, s0, 0x100620
|
||||
s_atomic_add s20, s0, 0x100624
|
||||
s_atomic_add s20, s0, 0x100628
|
||||
s_atomic_add s20, s0, 0x10062c
|
||||
|
||||
s_atomic_add s20, s0, 0x100700
|
||||
s_atomic_add s20, s0, 0x100704
|
||||
s_atomic_add s20, s0, 0x100708
|
||||
s_atomic_add s20, s0, 0x10070c
|
||||
s_atomic_add s20, s0, 0x100710
|
||||
s_atomic_add s20, s0, 0x100714
|
||||
s_atomic_add s20, s0, 0x100718
|
||||
s_atomic_add s20, s0, 0x10071c
|
||||
s_atomic_add s20, s0, 0x100720
|
||||
s_atomic_add s20, s0, 0x100724
|
||||
s_atomic_add s20, s0, 0x100728
|
||||
s_atomic_add s20, s0, 0x10072c
|
||||
|
||||
s_atomic_add s20, s0, 0x100800
|
||||
s_atomic_add s20, s0, 0x100804
|
||||
s_atomic_add s20, s0, 0x100808
|
||||
s_atomic_add s20, s0, 0x10080c
|
||||
s_atomic_add s20, s0, 0x100810
|
||||
s_atomic_add s20, s0, 0x100814
|
||||
s_atomic_add s20, s0, 0x100818
|
||||
s_atomic_add s20, s0, 0x10081c
|
||||
s_atomic_add s20, s0, 0x100820
|
||||
s_atomic_add s20, s0, 0x100824
|
||||
s_atomic_add s20, s0, 0x100828
|
||||
s_atomic_add s20, s0, 0x10082c
|
||||
|
||||
s_atomic_add s20, s0, 0x100900
|
||||
s_atomic_add s20, s0, 0x100904
|
||||
s_atomic_add s20, s0, 0x100908
|
||||
s_atomic_add s20, s0, 0x10090c
|
||||
s_atomic_add s20, s0, 0x100910
|
||||
s_atomic_add s20, s0, 0x100914
|
||||
s_atomic_add s20, s0, 0x100918
|
||||
s_atomic_add s20, s0, 0x10091c
|
||||
s_atomic_add s20, s0, 0x100920
|
||||
s_atomic_add s20, s0, 0x100924
|
||||
s_atomic_add s20, s0, 0x100928
|
||||
s_atomic_add s20, s0, 0x10092c
|
||||
|
||||
s_atomic_add s20, s0, 0x100a00
|
||||
s_atomic_add s20, s0, 0x100a04
|
||||
s_atomic_add s20, s0, 0x100a08
|
||||
s_atomic_add s20, s0, 0x100a0c
|
||||
s_atomic_add s20, s0, 0x100a10
|
||||
s_atomic_add s20, s0, 0x100a14
|
||||
s_atomic_add s20, s0, 0x100a18
|
||||
s_atomic_add s20, s0, 0x100a1c
|
||||
s_atomic_add s20, s0, 0x100a20
|
||||
s_atomic_add s20, s0, 0x100a24
|
||||
s_atomic_add s20, s0, 0x100a28
|
||||
s_atomic_add s20, s0, 0x100a2c
|
||||
|
||||
s_atomic_add s20, s0, 0x100b00
|
||||
s_atomic_add s20, s0, 0x100b04
|
||||
s_atomic_add s20, s0, 0x100b08
|
||||
s_atomic_add s20, s0, 0x100b0c
|
||||
s_atomic_add s20, s0, 0x100b10
|
||||
s_atomic_add s20, s0, 0x100b14
|
||||
s_atomic_add s20, s0, 0x100b18
|
||||
s_atomic_add s20, s0, 0x100b1c
|
||||
s_atomic_add s20, s0, 0x100b20
|
||||
s_atomic_add s20, s0, 0x100b24
|
||||
s_atomic_add s20, s0, 0x100b28
|
||||
s_atomic_add s20, s0, 0x100b2c
|
||||
|
||||
s_atomic_add s20, s0, 0x100c00
|
||||
s_atomic_add s20, s0, 0x100c04
|
||||
s_atomic_add s20, s0, 0x100c08
|
||||
s_atomic_add s20, s0, 0x100c0c
|
||||
s_atomic_add s20, s0, 0x100c10
|
||||
s_atomic_add s20, s0, 0x100c14
|
||||
s_atomic_add s20, s0, 0x100c18
|
||||
s_atomic_add s20, s0, 0x100c1c
|
||||
s_atomic_add s20, s0, 0x100c20
|
||||
s_atomic_add s20, s0, 0x100c24
|
||||
s_atomic_add s20, s0, 0x100c28
|
||||
s_atomic_add s20, s0, 0x100c2c
|
||||
|
||||
|
||||
s_sub_u32 s30, s30, 1
|
||||
s_cmpk_eq_u32 s30, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
|
||||
|
||||
//workgroups with s_tgid_x (s9) >= 2 are finished here and skip ATOMIC_LOOP
s_cmpk_ge_i32 s9, 0x2
|
||||
s_cbranch_scc1 END
|
||||
|
||||
//80 unrolled buffer atomics per iteration; s32 is not reset inside the loop, so the
//soffset keeps advancing by 0x2000 per atomic across all s30 iterations
ATOMIC_LOOP:
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
buffer_atomic_add_x2 v0, v3, s24, s32 offen:1 glc:1
|
||||
s_add_i32 s32, s32, 0x2000
|
||||
|
||||
s_sub_u32 s30, s30, 1
|
||||
s_cmpk_eq_u32 s30, 0
|
||||
s_cbranch_scc0 ATOMIC_LOOP
|
||||
|
||||
//s_waitcnt 0
|
||||
|
||||
END:
|
||||
s_waitcnt 0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -0,0 +1,80 @@
|
||||
shader main
|
||||
type(CS)
|
||||
// Purpose: each thread writes 33 dwords through the descriptor in s[20:23], stepping the
// scalar offset by 4 KB per store. The load-back loop at the end is gated by
// DEBUG_FUNCTION, which is 0 in this variant.
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 thread/thread-group parameters (s2..s8)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset (not read by this shader)
|
||||
//s8 loop count (overwritten with 33 below)
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//NOTE(review): MTYPE_UC is ORed into s27 (last dword of the s[24:27] descriptor), but every
//memory instruction below uses the s20 descriptor - confirm which descriptor was intended
var MTYPE_UC = 0x38000000
|
||||
s_or_b32 s27, s27, MTYPE_UC
|
||||
|
||||
|
||||
//store (and, when DEBUG_FUNCTION is set, load) s8 times; s8 is forced to 33 here and s30 keeps a copy
|
||||
s_mov_b32 s8, 33 // store 33 times to overflow atcl1 cache...
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
//each iteration bumps v0 by 2, stores it at index v9 with soffset s31, then advances s31 by 4 KB
STORE_LOOP:
|
||||
v_add_co_u32 v0, vcc[0:1], v0, 2
|
||||
buffer_store_dword v0, v9, s20, s31 idxen:1 glc:1 slc:1
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 4*1024 // step one 4KB page size
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
|
||||
|
||||
var DEBUG_FUNCTION = 0
|
||||
//DEBUG_FUNCTION gates the load-back loop below; leaving it at 0 removes that code to halve shader run time
|
||||
if DEBUG_FUNCTION
|
||||
s_mov_b32 s8, s30
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
//reads back the dwords at the same offsets as STORE_LOOP; each value is copied to v12
LOAD_LOOP:
|
||||
buffer_load_dword v0, v9, s20, s31 idxen:1 glc:1 slc:1
|
||||
s_waitcnt 0
|
||||
v_mov_b32 v12, v0
|
||||
s_add_u32 s31, s31, 4*1024
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
end
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
shader main
|
||||
type(CS)
|
||||
// Purpose: each thread writes 33 dwords through the descriptor in s[20:23], stepping the
// scalar offset by 4 KB per store, then (DEBUG_FUNCTION is 1 in this variant) issues
// 32 dwordx2 loads starting at scalar offset 0xffc with the same 4 KB step.
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 thread/thread-group parameters (s2..s8)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset (not read by this shader)
|
||||
//s8 loop count (overwritten with 33 below)
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_load_dwordx4 s[20:23], s[0:1], 16 // load atc mem surface rsrc
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//NOTE(review): MTYPE_UC is ORed into s27 (last dword of the s[24:27] descriptor), but every
//memory instruction below uses the s20 descriptor - confirm which descriptor was intended
var MTYPE_UC = 0x38000000
|
||||
s_or_b32 s27, s27, MTYPE_UC
|
||||
|
||||
|
||||
//store s8 times; s8 is forced to 33 here (s30 keeps a copy that this variant does not read again)
|
||||
s_mov_b32 s8, 33 // store 33 times to overflow atcl1 cache...
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
//each iteration bumps v0 by 2, stores it at index v9 with soffset s31, then advances s31 by 4 KB
STORE_LOOP:
|
||||
v_add_co_u32 v0, vcc[0:1], v0, 2
|
||||
buffer_store_dword v0, v9, s20, s31 idxen:1 glc:1 slc:1
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 4*1024 // step one 4KB page size
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
|
||||
|
||||
var DEBUG_FUNCTION = 1
|
||||
//DEBUG_FUNCTION gates the load loop below; setting it to 0 removes that code to halve shader run time
|
||||
if DEBUG_FUNCTION
|
||||
s_mov_b32 s8, 0x20
|
||||
s_mov_b32 s31, 0xffc
|
||||
|
||||
//32 iterations of an 8-byte (dwordx2) load; soffset starts 4 bytes below a 4 KB multiple and
//steps by 4 KB - presumably so every load straddles a 4 KB boundary; confirm against the test intent
LOAD_LOOP:
|
||||
buffer_load_dwordx2 v[0:1], v9, s20, s31 idxen:1 glc:1 slc:1
|
||||
s_waitcnt 0
|
||||
v_mov_b32 v12, v0
|
||||
s_add_u32 s31, s31, 4*1024
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
end
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
shader main
|
||||
type(CS)
|
||||
// Purpose: each thread performs s8 dword stores followed by s8 dword loads through the
// descriptor in s[24:27], using its absolute thread id scaled by 65536 as the index and
// with descriptor dword s26 forced to 0xffffffff.
|
||||
user_sgpr_count(9) // 2 for the buffer resource address (s[0:1]) + 7 thread/thread-group parameters (s2..s8)
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset (not read by this shader)
|
||||
//s8 loop count
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0
|
||||
v_mad_u32_u24 v3, v2, s3, v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5
|
||||
s_add_i32 s28, s28, s_tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
//bump up the addresses being accessed to generate multiple reads to the pde memories
|
||||
v_mul_u32_u24 v9, 65536, v9
|
||||
//store and load s8 times; s30 keeps the original count so s8 can be restored for LOAD_LOOP
|
||||
s_mov_b32 s30, s8
|
||||
s_mov_b32 s31, 0x0
|
||||
|
||||
//Hack number of records to avoid range checking which we don't want since we want to generate
|
||||
//out of range accesses. we are really trying to generate many reads to the PDEs to get FUE.
|
||||
s_mov_b32 s26, 0xffffffff
|
||||
|
||||
//stores v0 at index v9, advancing soffset s31 by 4 bytes per iteration
//NOTE(review): s8 is decremented before it is tested, so s8 == 0 on entry would not stop
//after zero iterations - presumably the host always passes s8 >= 1; confirm
STORE_LOOP:
|
||||
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 STORE_LOOP
|
||||
|
||||
s_mov_b32 s8, s30 // restore the loop count consumed by STORE_LOOP
|
||||
s_mov_b32 s31, 0x0 // restart from soffset 0
|
||||
|
||||
//loads back from the same index/offsets as STORE_LOOP; each value is copied to v12
LOAD_LOOP:
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
v_mov_b32 v12, v0
|
||||
s_add_u32 s31, s31, 0x4
|
||||
s_sub_u32 s8, s8, 1
|
||||
s_cmpk_eq_u32 s8, 0
|
||||
s_cbranch_scc0 LOAD_LOOP
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
//Compute shader: derives each thread's absolute id, multiplies it by 4096, then runs a store loop and a load loop of s8 iterations each through the resource loaded into s[24:27].
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address + 5 for thread/thread group parameters + output offset + loop count
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop count
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC: four dwords at s[0:1] + 0x0 into s[24:27]
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0 //v3 = tid_y*x + tid_x
|
||||
v_mad_u32_u24 v3, v2, s3, v3 //v3 = tid_z*x*y + v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5 //tgid_y*X
|
||||
s_add_i32 s28, s28, s_tgid_x //tgid_y*X + tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z //tgid_z*X*Y
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
//bump up the addresses being accessed to generate multiple reads to the pde memories
|
||||
v_mul_u32_u24 v9, 4096, v9 //v9 = 4096 * absolute thread id
|
||||
//store and load s8 times
|
||||
s_mov_b32 s30, s8 //keep a copy of the loop count; it is restored into s8 before LOAD_LOOP
|
||||
s_mov_b32 s31, 0x0 //s31 is the offset operand of the buffer accesses, starting at 0
|
||||
|
||||
//Hack: override the number of records (s26) to avoid range checking, which we don't want since we want to generate
|
||||
//out-of-range accesses. We are really trying to generate many reads to the PDEs to get FUE.
|
||||
s_mov_b32 s26, 0xffffffff
|
||||
|
||||
STORE_LOOP:
|
||||
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 0x4 //advance the offset operand by 4
|
||||
s_sub_u32 s8, s8, 1 //one iteration done
|
||||
s_cmpk_eq_u32 s8, 0 //scc = (s8 == 0) ? 1 : 0
|
||||
s_cbranch_scc0 STORE_LOOP //if(scc == 0) then loop again
|
||||
|
||||
s_mov_b32 s8, s30 //restore the loop count saved before STORE_LOOP
|
||||
s_mov_b32 s31, 0x0 //restart the offset operand at 0
|
||||
|
||||
LOAD_LOOP:
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
v_mov_b32 v12, v0 //copy the loaded dword into v12
|
||||
s_add_u32 s31, s31, 0x4 //advance the offset operand by 4
|
||||
s_sub_u32 s8, s8, 1 //one iteration done
|
||||
s_cmpk_eq_u32 s8, 0 //scc = (s8 == 0) ? 1 : 0
|
||||
s_cbranch_scc0 LOAD_LOOP //if(scc == 0) then loop again
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
//Compute shader: derives each thread's absolute id and issues one indexed buffer_load_dword with it through the resource loaded into s[24:27].
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(8) // 2 for the buffer resource address + 5 for thread/thread group parameters + output offset
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s8
|
||||
tgid_y_en(1) //s_tgid_y s9
|
||||
tgid_z_en(1) //s_tgid_z s10
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC: four dwords at s[0:1] + 0x0 into s[24:27]
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0 //v3 = tid_y*x + tid_x
|
||||
v_mad_u32_u24 v3, v2, s3, v3 //v3 = tid_z*x*y + v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5 //tgid_y*X
|
||||
s_add_i32 s28, s28, s_tgid_x //tgid_y*X + tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z //tgid_z*X*Y
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//read mem data: one dword per thread, indexed by v9, with offset operand s31 = 0
|
||||
s_mov_b32 s31, 0x0
|
||||
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1
|
||||
s_waitcnt 0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
//Compute shader: derives each thread's absolute id, then stores v0 s8 times through the resource loaded into s[24:27], stepping the offset operand by 4 each iteration.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address + 5 for thread/thread group parameters + output offset + loop count
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop count
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//fetch the buffer resource through SQC: four dwords at s[0:1] + 0x0 into s[24:27]
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
// v3 thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0 //v3 = tid_y*x + tid_x
|
||||
v_mad_u32_u24 v3, v2, s3, v3 //v3 = tid_z*x*y + v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5 //tgid_y*X
|
||||
s_add_i32 s28, s28, s_tgid_x //tgid_y*X + tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z //tgid_z*X*Y
|
||||
s_add_i32 s28, s29, s28
|
||||
|
||||
//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
|
||||
v_mov_b32 v9, s28
|
||||
v_mad_u32_u24 v9, v9, s4, v3
|
||||
|
||||
//store s8 times (this shader has no load loop)
|
||||
s_mov_b32 s30, s8 //copy of the loop count; s30 is not read again in this shader
|
||||
s_mov_b32 s31, 0x0 //s31 is the offset operand of the buffer store, starting at 0
|
||||
|
||||
STORE_LOOP:
|
||||
buffer_store_dword v0, v9, s24, s31 idxen:1 glc:0
|
||||
s_waitcnt 0
|
||||
s_add_u32 s31, s31, 0x4 //advance the offset operand by 4
|
||||
s_sub_u32 s8, s8, 1 //one iteration done
|
||||
s_cmpk_eq_u32 s8, 0 //scc = (s8 == 0) ? 1 : 0
|
||||
s_cbranch_scc0 STORE_LOOP //if(scc == 0) then loop again
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
@@ -0,0 +1,54 @@
|
||||
//Compute shader: copies v0/v1/v2 into v10-v19, copies those into v20-v29, then stores s2 (via s16) at s[0:1] + 0x0.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(9) // 2 for the buffer resource address + 5 for thread/thread group parameters + output offset + loop count
|
||||
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
//s2 x
|
||||
//s3 x*y
|
||||
//s4 x*y*z
|
||||
//s5 X
|
||||
//s6 X*Y
|
||||
//s7 output offset
|
||||
//s8 loop count
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s9
|
||||
tgid_y_en(1) //s_tgid_y s10
|
||||
tgid_z_en(1) //s_tgid_z s11
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
s_mov_b32 s16, s2 //s16 = x; this is the value stored at the end of the shader
|
||||
|
||||
//SPI may touch v0,v1,v2 before shader is run
|
||||
|
||||
//store it 10 times: write v10..v19 from v1, v2 and (last move only) v0
|
||||
v_mov_b32 v10, v1
|
||||
v_mov_b32 v11, v2
|
||||
v_mov_b32 v12, v1
|
||||
v_mov_b32 v13, v2
|
||||
v_mov_b32 v14, v1
|
||||
v_mov_b32 v15, v2
|
||||
v_mov_b32 v16, v1
|
||||
v_mov_b32 v17, v2
|
||||
v_mov_b32 v18, v1
|
||||
v_mov_b32 v19, v0
|
||||
|
||||
// read them back: copy v10..v19 into v29..v20 (reverse destination order)
|
||||
v_mov_b32 v29, v10
|
||||
v_mov_b32 v28, v11
|
||||
v_mov_b32 v27, v12
|
||||
v_mov_b32 v26, v13
|
||||
v_mov_b32 v25, v14
|
||||
v_mov_b32 v24, v15
|
||||
v_mov_b32 v23, v16
|
||||
v_mov_b32 v22, v17
|
||||
v_mov_b32 v21, v18
|
||||
v_mov_b32 v20, v19
|
||||
|
||||
s_store_dword s16, s[0:1], 0x0 glc //scalar store of s16 at s[0:1] + 0x0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
@@ -0,0 +1,75 @@
|
||||
//Compute shader: sweeps all 256 acc/v registers, zeroes VGPRs through GPR indexing, issues ds_write2_b64 writes, then stores an id built from HW_REG_HW_ID fields at s[0:1] + 4 * id.
shader main
|
||||
type(CS)
|
||||
|
||||
user_sgpr_count(2) // 2 for the buffer resource address; no other user SGPRs in this shader
|
||||
//s[0:1] the memory address for the buffer resource
|
||||
|
||||
tgid_x_en(1) //s_tgid_x s2
|
||||
tgid_y_en(1) //s_tgid_y s3
|
||||
tgid_z_en(1) //s_tgid_z s4
|
||||
|
||||
//v0 for tid_x
|
||||
//v1 for tid_y
|
||||
//v2 for tid_z
|
||||
|
||||
//assemble-time loop: read acc[0..255] into v[0..255]
for var vgpr = 0; vgpr < 256; ++vgpr
|
||||
v_accvgpr_read v[vgpr], acc[vgpr]
|
||||
end
|
||||
|
||||
//assemble-time loop: write v[0..255] back into acc[0..255]
for var vgpr = 0; vgpr < 256; ++vgpr
|
||||
v_accvgpr_write acc[vgpr], v[vgpr]
|
||||
end
|
||||
|
||||
s_movk_i32 m0, 0x0000
|
||||
s_mov_b32 s10, 0x000000f8 //starting GPR index value (0xf8 = 248), stepped down by 8 per iteration
|
||||
s_set_gpr_idx_on s10, 0x8
|
||||
label_0004:
|
||||
v_mov_b32 v0, 0 //eight zeroing moves per iteration, issued with GPR indexing enabled
|
||||
v_mov_b32 v1, 0
|
||||
v_mov_b32 v2, 0
|
||||
v_mov_b32 v3, 0
|
||||
v_mov_b32 v4, 0
|
||||
v_mov_b32 v5, 0
|
||||
v_mov_b32 v6, 0
|
||||
v_mov_b32 v7, 0
|
||||
s_sub_u32 s10, s10, 8
|
||||
s_set_gpr_idx_idx s10
|
||||
s_cbranch_scc0 label_0004 //if(scc == 0) then loop again; NOTE(review): presumably scc is the borrow of the s_sub_u32 above - confirm
|
||||
s_set_gpr_idx_off
|
||||
v_mbcnt_lo_u32_b32 v1, exec_hi, 0
|
||||
v_mbcnt_hi_u32_b32 v1, exec_lo, v1
|
||||
v_mul_u32_u24 v1, 8, v1 //v1 = 8 * mbcnt result
|
||||
s_getreg_b32 s11, hwreg(HW_REG_HW_ID, 4, 2) //2-bit field at bit 4 of HW_ID (the same bits extracted as SIMD further down)
|
||||
s_mulk_i32 s11, 0x4000 //s11 = field * 0x4000
|
||||
v_add_co_u32 v1, vcc, v1, s11 //v1 is the address operand of the ds_write2_b64 instructions below
|
||||
s_mov_b32 s10, 7 //loop counter for label_001B
|
||||
s_mov_b32 m0, -1
|
||||
label_001B:
|
||||
ds_write2_b64 v1, v[2:3], v[2:3] offset1:64
|
||||
ds_write2_b64 v1, v[4:5], v[4:5] offset0:128 offset1:192
|
||||
v_add_co_u32 v1, vcc, 0x00000800, v1 //advance the address operand by 0x800
|
||||
s_sub_u32 s10, s10, 1
|
||||
s_cbranch_scc0 label_001B //if(scc == 0) then loop again; NOTE(review): presumably scc is the borrow of the s_sub_u32 above - confirm
|
||||
|
||||
s_getreg_b32 s20, hwreg(HW_REG_HW_ID, 0, 32) //s20 = full 32-bit HW_ID register
|
||||
// s12 = SIMD (bits [5:4] of s20)
|
||||
s_lshr_b32 s12,s20,4
|
||||
s_and_b32 s12, s12, 0x3
|
||||
// s13 = CU (bits [11:8] of s20)
|
||||
s_lshr_b32 s13,s20,8
|
||||
s_and_b32 s13, s13, 0xf
|
||||
// s14 = SE (bits [15:13] of s20)
|
||||
s_lshr_b32 s14,s20,13
|
||||
s_and_b32 s14, s14, 0x7
|
||||
// s15 = SE * 16 * 4 + CU * 4 + SIMD
|
||||
s_mul_i32 s16, s14, 64
|
||||
s_mul_i32 s17, s13, 4
|
||||
s_add_i32 s15, s16, s17
|
||||
s_add_i32 s15, s15, s12
|
||||
s_mul_i32 s16, s15, 4 //s16 = s15 * 4, used as the offset of the store below
|
||||
|
||||
s_store_dword s15, s[0:1], s16 glc //store s15 at s[0:1] + s16
|
||||
s_waitcnt 0
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
@@ -0,0 +1,58 @@
|
||||
//s[0:1]: buffer resource
|
||||
//s2: num_threads_x_full
|
||||
//s3: num_threads_x_full * num_threads_y_full
|
||||
//s4: num_threads_x_full * num_threads_y_full * num_threads_z_full
|
||||
//s5: COMPUTE_DIM_X
|
||||
//s6: COMPUTE_DIM_X * COMPUTE_DIM_Y
|
||||
//s7: loop_lifetime
|
||||
//s8: dispatch_offset
|
||||
//s[9:11]: thread group ID
|
||||
//v[0:2]: thread ID
|
||||
|
||||
//Compute shader: spins for s7 iterations, stores each thread's absolute id indexed by itself with offset operand s8, then stores the marker 0xa5a50000 at s[0:1] + 0x40.
shader main
|
||||
|
||||
type(CS)
|
||||
user_sgpr_count(9)
|
||||
tgid_x_en(1)
|
||||
tgid_y_en(1)
|
||||
tgid_z_en(1)
|
||||
|
||||
//sp3 loop for lifetime: iterates s7 times, writing the loop index to v4 each pass
|
||||
s_mov_b32 s12, 0 //init loop idx s12
|
||||
label_0004:
|
||||
s_cmp_lt_i32 s12, s7 //scc = (s12 < s7) ? 1 : 0
|
||||
s_cbranch_scc0 label_0006 //if(scc == 0) then jump to label_0006; else nop
|
||||
|
||||
v_mov_b32 v4,s12
|
||||
s_add_i32 s12, s12, 1 //add loop incr
|
||||
s_branch label_0004
|
||||
|
||||
label_0006: //end of SP3 loop
|
||||
|
||||
//v3 thread_id_in_group = (tid_z * num_threads_x_full * num_threads_y_full) + (tid_y * num_threads_x_full) + tid_x
|
||||
v_mad_u32_u24 v3, v1, s2, v0 //v3 = tid_y * num_threads_x_full + tid_x
|
||||
v_mad_u32_u24 v3, v2, s3, v3 //v3 = tid_z * num_threads_x_full * num_threads_y_full + v3
|
||||
|
||||
//s28 thread_group_id = (tgid_z * COMPUTE_DIM_X * COMPUTE_DIM_Y) + (tgid_y * COMPUTE_DIM_X) + tgid_x
|
||||
s_mul_i32 s28, s_tgid_y, s5 //tgid_y * COMPUTE_DIM_X
|
||||
s_add_i32 s28, s28, s_tgid_x //tgid_y * COMPUTE_DIM_X + tgid_x
|
||||
s_mul_i32 s29, s6, s_tgid_z //tgid_z * COMPUTE_DIM_X * COMPUTE_DIM_Y
|
||||
s_add_i32 s28, s29, s28 //s28 = sum of the two partial products = thread_group_id
|
||||
|
||||
//v9 absolute thread id = thread_group_id * (num_threads_x_full * num_threads_y_full * num_threads_z_full) + thread_id_in_group
|
||||
v_mov_b32 v9, s28 //thread_group_id
|
||||
v_mad_u32_u24 v9, v9, s4, v3 //v9 = thread_group_id * s4 + v3
|
||||
|
||||
//fetch the buffer resource: four dwords at s[0:1] + 0x0 into s[24:27]
|
||||
s_load_dwordx4 s[24:27], s[0:1], 0x0
|
||||
s_waitcnt 0
|
||||
|
||||
//write absolute thread id using it as an index; s8 (dispatch_offset) is the offset operand
|
||||
buffer_store_dword v9, v9, s24, s8 idxen:1
|
||||
s_waitcnt 0
|
||||
|
||||
s_mov_b32 s16, 0xa5a50000 //marker value stored below
|
||||
s_store_dword s16, s[0:1], 0x40 glc //scalar store of the marker at s[0:1] + 0x40
|
||||
|
||||
s_endpgm
|
||||
end
|
||||
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичные данные
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Ссылка в новой задаче
Block a user