Files
Evgeny 7c4369bde4 Initial Commit
Contributors:
Ammar ELWazir <aelwazir@amd.com>
AravindanC <aravindan.cheruvally@amd.com>
Benjamin Welton <bewelton@amd.com>
Ma, Bing <Bing.Ma@amd.com>
Chun Yang <chun.yang@amd.com>
Cole Nelson <cole.nelson@amd.com>
Ethan Stewart <ethan.stewart@amd.com>
Evgeny <evgeny.shcherbakov@amd.com>
Freddy Paul <Freddy.paul@amd.com>
Giovanni Baraldi <gbaraldi@amd.com>
Gopesh Bhardwaj <Gopesh.Bhardwaj@amd.com>
Icarus Sparry <icarus.sparry@amd.com>
itrowbri <Ian.Trowbridge@amd.com>
James Edwards <JamesAdrian.Edwards@amd.com>
jatang <jatang@amd.com>
Jeremy Newton <Jeremy.Newton@amd.com>
Jonathan Kim <jonathan.kim@amd.com>
Kent Russell <kent.russell@amd.com>
Kiumars Sabeti <kiumars.sabeti@amd.com>
Lang Yu <lang.yu@amd.com>
Laurent Morichetti <laurent.morichetti@amd.com>
Mallya, Ameya Keshava <AmeyaKeshava.Mallya@amd.com>
Manjunath Jakaraddi <manjunath.jakaraddi@amd.com>
Mark Laws <markdavid.laws@amd.com>
Mohan Kumar Mithur <Mohan.KumarMithur@amd.com>
Nicholas Curtis <nicurtis@amd.com>
Nirmal Unnikrishnan <Nirmal.Unnikrishnan@amd.com>
Parag Bhandari <parag.bhandari@amd.com>
Ranjith Ramakrishnan <Ranjith.Ramakrishnan@amd.com>
Robert Gregory <Robert.Gregory@amd.com>
Saravanan Solaiyappan <saravanan.solaiyappan@amd.com>
Saurabh Verma <saurabh.verma@amd.com>
Srihari Uttanur <srihari.u@amd.com>
Srinivasan Subramanian <srinivasan.subramanian@amd.com>
Sriraksha Nagaraj <Sriraksha.Nagaraj@amd.com>
Sushma Vaddireddy <svaddire@amd.com>
Xianwei Zhang <Xianwei.Zhang@amd.com>


[ROCm/aqlprofile commit: 1ed169e30c]
2025-05-28 10:10:47 -05:00

155 lines
4.2 KiB
Plaintext

// HSAIL module header. The module directive names the module (&m), gives
// the HSAIL version as major:minor (1:0), and then lists the profile
// ($full), machine model ($large) and default floating-point rounding
// mode ($default).
module &m:1:0:$full:$large:$default;
// Extensions this module declares that it uses.
extension "amd:gcn";
extension "IMAGE";
// Declaration only (no body) of a program-linkage function that takes no
// arguments and returns nothing. The kernel visible in this file contains
// no call to it.
decl prog function &abort()();
// Kernel &__OpenCL_simple_convolution
//
// Each work-item produces one u32 element of %output: it accumulates, in
// f32, the products (f32)input[...] * mask[...] over a rectangular window
// around its own position, adds 0.5, converts the sum to u32 and stores it.
// The window is clamped to [0, in0-1] x [0, in1-1].
//
// Shorthand used in the comments below (names chosen for this commentary,
// they do not appear in the code):
//   tid      = low 32 bits of (workitemabsid(0) + %__global_offset_0)
//   in0, in1 = %inputDimensions[0], %inputDimensions[1]
//   mk0, mk1 = %maskDimensions[0],  %maskDimensions[1]
//   x = tid % in0              y = tid / in0
//   hx = (mk0 - 1) >> 1        hy = (mk1 - 1) >> 1
//   lo_x = (x < hx) ? 0 : x - hx      hi_x = (x + hx >= in0) ? in0 - 1 : x + hx
//   lo_y = (y < hy) ? 0 : y - hy      hi_y = (y + hy >= in1) ? in1 - 1 : y + hy
// All integer arithmetic is unsigned 32-bit; "add ..., 4294967295" is the
// wrap-around form of "subtract 1".
//
// Arguments:
//   %__global_offset_0  u64 added to the work-item absolute id
//   %output             u64 address; u32 elements are stored through it
//   %input              u64 address; u32 elements are loaded through it
//   %mask               u64 address; f32 elements are loaded through it
//   %inputDimensions    two u32 values (in0, in1)
//   %maskDimensions     two u32 values (mk0, mk1)
//
// NOTE(review): in0 is used as a divisor (rem_u32/div_u32) with no zero
// check, and none of the loads/stores are range-checked against buffer
// sizes; presumably the host guarantees valid dimensions -- confirm.
prog kernel &__OpenCL_simple_convolution(kernarg_u64 %__global_offset_0,
kernarg_u64 %output,
kernarg_u64 %input,
kernarg_u64 %mask,
kernarg_u32 %inputDimensions[2],
kernarg_u32 %maskDimensions[2]) {
// "AMD RTI" pragmas: string metadata bracketed by ARGSTART/ARGEND. The
// value:/pointer:/reflection: entries name the kernel arguments listed
// above. NOTE(review): presumably consumed by the AMD OpenCL runtime; the
// meaning of the individual numeric fields is not visible here -- confirm
// before editing any of these strings.
pragma "AMD RTI", "ARGSTART:__OpenCL_simple_convolution";
pragma "AMD RTI", "version:3:1:104";
pragma "AMD RTI", "device:generic";
pragma "AMD RTI", "uniqueid:1024";
pragma "AMD RTI", "memory:private:0";
pragma "AMD RTI", "memory:region:0";
pragma "AMD RTI", "memory:local:0";
pragma "AMD RTI", "value:__global_offset_0:u64:1:1:0";
pragma "AMD RTI", "pointer:output:u32:1:1:96:uav:7:4:RW:0:0:0";
pragma "AMD RTI", "pointer:input:u32:1:1:112:uav:7:4:RW:0:0:0";
pragma "AMD RTI", "pointer:mask:float:1:1:128:uav:7:4:RW:0:0:0";
pragma "AMD RTI", "value:inputDimensions:u32:2:1:144";
pragma "AMD RTI", "constarg:4:inputDimensions";
pragma "AMD RTI", "value:maskDimensions:u32:2:1:160";
pragma "AMD RTI", "constarg:5:maskDimensions";
pragma "AMD RTI", "function:1:0";
pragma "AMD RTI", "memory:64bitABI";
pragma "AMD RTI", "privateid:8";
pragma "AMD RTI", "enqueue_kernel:0";
pragma "AMD RTI", "kernel_index:0";
pragma "AMD RTI", "reflection:0:size_t";
pragma "AMD RTI", "reflection:1:uint*";
pragma "AMD RTI", "reflection:2:uint*";
pragma "AMD RTI", "reflection:3:float*";
pragma "AMD RTI", "reflection:4:uint2";
pragma "AMD RTI", "reflection:5:uint2";
pragma "AMD RTI", "ARGEND:__OpenCL_simple_convolution";
@__OpenCL_simple_convolution_Entry:
// BB#0: // %entry
// Compute tid, x, and the clamped x-range [lo_x, hi_x].
workitemabsid_u32 $s6, 0; // $s6 = absolute work-item id in dimension 0
cvt_u64_u32 $d0, $s6; // zero-extend to 64 bits
ld_kernarg_align(8)_width(all)_u64 $d4, [%__global_offset_0]; // $d4 = global offset (kept live for BB0_2)
add_u64 $d0, $d0, $d4; // $d0 = id + offset (64-bit); reused at BB0_6 for the store address
cvt_u32_u64 $s5, $d0; // $s5 = tid
ld_v2_kernarg_align(4)_width(all)_u32 ($s0, $s4), [%inputDimensions]; // $s0 = in0, $s4 = in1
ld_v2_kernarg_align(4)_width(all)_u32 ($s1, $s9), [%maskDimensions]; // $s1 = mk0, $s9 = mk1
rem_u32 $s7, $s5, $s0; // $s7 = x = tid % in0
add_u32 $s2, $s1, 4294967295; // mk0 - 1
shr_u32 $s8, $s2, 1; // $s8 = hx
add_u32 $s2, $s7, $s8; // x + hx
add_u32 $s3, $s0, 4294967295; // in0 - 1
cmp_ge_b1_u32 $c0, $s2, $s0;
cmov_b32 $s2, $c0, $s3, $s2; // $s2 = hi_x
sub_u32 $s3, $s7, $s8; // x - hx (wraps when x < hx; replaced by 0 below in that case)
cmp_lt_b1_u32 $c0, $s7, $s8;
cmov_b32 $s3, $c0, 0, $s3; // $s3 = lo_x; doubles as the outer loop counter
ld_kernarg_align(8)_width(all)_u64 $d1, [%output]; // $d1 = output base address
cmp_le_b1_u32 $c0, $s3, $s2;
cbr_b1 $c0, @BB0_2; // x-range non-empty: go set up the loops
// BB#1:
// Empty x-range: the sum is 0 (all-zero bits, i.e. 0.0f when read as f32).
mov_b32 $s6, 0;
br @BB0_6;
// @BB0_2: // %for.cond32.preheader.lr.ph
// Loop setup: y, the clamped y-range [lo_y, hi_y], base addresses, and the
// starting mask/input element indices.
@BB0_2:
div_u32 $s5, $s5, $s0; // $s5 = y = tid / in0
add_u32 $s9, $s9, 4294967295; // mk1 - 1
shr_u32 $s9, $s9, 1; // $s9 = hy
add_u32 $s10, $s5, $s9; // y + hy
add_u32 $s11, $s4, 4294967295; // in1 - 1
cmp_ge_b1_u32 $c0, $s10, $s4;
cmov_b32 $s4, $c0, $s11, $s10; // $s4 = hi_y (in1 is no longer needed)
sub_u32 $s10, $s5, $s9; // y - hy
cmp_lt_b1_u32 $c0, $s5, $s9;
cmov_b32 $s5, $c0, 0, $s10; // $s5 = lo_y (overwrites y)
ld_kernarg_align(8)_width(all)_u64 $d2, [%mask]; // $d2 = mask base address
ld_kernarg_align(8)_width(all)_u64 $d3, [%input]; // $d3 = input base address
// $s5 no longer holds y, so y is recomputed from the still-live work-item
// id in $s6 and the global offset in $d4.
cvt_u64_u32 $d5, $s6;
add_u64 $d4, $d4, $d5;
cvt_u32_u64 $s6, $d4; // $s6 = tid again
div_u32 $s6, $s6, $s0; // $s6 = y
max_u32 $s10, $s9, $s6; // max(hy, y)
sub_u32 $s12, $s10, $s6; // max(hy, y) - y == lo_y - (y - hy): first mask row used
max_u32 $s11, $s7, $s8; // max(x, hx)
mov_b32 $s6, 0; // $s6 becomes the f32 accumulator, starting at 0.0f
mad_u32 $s12, $s1, $s12, $s11; // mk0 * (first mask row) + max(x, hx)
sub_u32 $s7, $s12, $s7; // $s7 = starting mask index = mk0*(lo_y-(y-hy)) + (lo_x-(x-hx)); overwrites x
sub_u32 $s9, $s10, $s9; // max(hy, y) - hy == lo_y
mad_u32 $s9, $s0, $s9, $s11; // in0 * lo_y + max(x, hx)
sub_u32 $s8, $s9, $s8; // $s8 = starting input index = in0*lo_y + lo_x; overwrites hx
// @BB0_3: // %for.cond32.preheader
// Outer loop head, one pass per x position in [lo_x, hi_x] ($s3 counts).
// $s7 / $s8 hold the mask / input index for (current x, lo_y).
@BB0_3:
cmp_gt_b1_u32 $c0, $s5, $s4; // lo_y > hi_y ?
mov_b32 $s9, $s7; // $s9 = running mask index for the inner loop
mov_b32 $s10, $s8; // $s10 = running input index for the inner loop
mov_b32 $s11, $s5; // $s11 = inner counter, starts at lo_y
cbr_b1 $c0, @BB0_5; // empty y-range: skip the inner loop
// @BB0_4: // %for.body35
// Inner loop, one pass per y position in [lo_y, hi_y].
@BB0_4:
cvt_u64_u32 $d4, $s9;
shl_u64 $d4, $d4, 2; // index * 4 bytes
add_u64 $d4, $d2, $d4;
ld_global_align(4)_f32 $s12, [$d4]; // $s12 = mask[$s9] (f32)
cvt_u64_u32 $d4, $s10;
shl_u64 $d4, $d4, 2; // index * 4 bytes
add_u64 $d4, $d3, $d4;
ld_global_align(4)_u32 $s13, [$d4]; // $s13 = input[$s10] (u32)
cvt_f32_u32 $s13, $s13; // input element as f32
mul_ftz_f32 $s12, $s13, $s12; // product
add_u32 $s9, $s9, $s1; // mask index += mk0
add_u32 $s10, $s10, $s0; // input index += in0
add_u32 $s11, $s11, 1; // inner counter++
add_ftz_f32 $s6, $s6, $s12; // accumulate
cmp_le_b1_u32 $c0, $s11, $s4;
cbr_b1 $c0, @BB0_4; // repeat while counter <= hi_y
// @BB0_5: // %for.inc48
// Outer loop increment: move one element along x in both arrays.
@BB0_5:
add_u32 $s7, $s7, 1; // starting mask index for the next x
add_u32 $s8, $s8, 1; // starting input index for the next x
add_u32 $s3, $s3, 1; // outer counter++
cmp_le_b1_u32 $c0, $s3, $s2;
cbr_b1 $c0, @BB0_3; // repeat while counter <= hi_x
// @BB0_6: // %for.end50
// Epilogue: store the converted sum to output[tid].
@BB0_6:
and_b64 $d0, $d0, 4294967295; // keep the low 32 bits of (id + offset), i.e. tid
shl_u64 $d0, $d0, 2; // tid * 4 bytes
add_u64 $d0, $d1, $d0; // address of output[tid]
add_ftz_f32 $s0, $s6, 0F3f000000; // sum + 0.5f (0x3f000000 is 0.5 in IEEE-754 single)
cvt_ftz_u32_f32 $s0, $s0; // f32 -> u32
st_global_align(4)_u32 $s0, [$d0];
ret;
};