From 5ef8ef3bd7b95cba058dbce318e8b98c677bbf29 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 12 Jan 2017 14:10:51 -0600 Subject: [PATCH] added packed math fp16 native device functions 1. Added SDWA implementation inside IR file 2. Added device functions to header + used them in test Change-Id: Ib4e059a58eee201cc82438689e3e9bc5f9d26653 --- include/hip/hcc_detail/hip_fp16.h | 71 +++++++++++++++++++++++++++++ src/hip_ir.ll | 23 ++++++++++ tests/src/deviceLib/hipTestHalf.cpp | 12 +++++ 3 files changed, 106 insertions(+) diff --git a/include/hip/hcc_detail/hip_fp16.h b/include/hip/hcc_detail/hip_fp16.h index c779bcfba2..2ef8d330e7 100644 --- a/include/hip/hcc_detail/hip_fp16.h +++ b/include/hip/hcc_detail/hip_fp16.h @@ -41,6 +41,11 @@ extern "C" __half __hip_hc_ir_hfma_half(__half, __half, __half); extern "C" __half __hip_hc_ir_hmul_half(__half, __half); extern "C" __half __hip_hc_ir_hsub_half(__half, __half); +extern "C" int __hip_hc_ir_hadd2_int(int, int); +extern "C" int __hip_hc_ir_hfma2_int(int, int, int); +extern "C" int __hip_hc_ir_hmul2_int(int, int); +extern "C" int __hip_hc_ir_hsub2_int(int, int); + __device__ static inline __half __hadd(const __half a, const __half b) { return __hip_hc_ir_hadd_half(a, b); } @@ -81,6 +86,72 @@ __device__ static inline __half hdiv(__half a, __half b) { return a/b; } +/* + Half2 Arithmetic Functions +*/ + +__device__ static inline __half2 __hadd2(__half2 a, __half2 b) { + __half2 c; + c.q = __hip_hc_ir_hadd2_int(a.q, b.q); + return c; +} + +__device__ static inline __half2 __hadd2_sat(__half2 a, __half2 b) { + __half2 c; + c.q = __hip_hc_ir_hadd2_int(a.q, b.q); + return c; +} + +__device__ static inline __half2 __hfma2(__half2 a, __half2 b, __half2 c) { + __half2 d; + d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q); + return d; +} + +__device__ static inline __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c) { + __half2 d; + d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q); + return d; +} + +__device__ static inline __half2 __hmul2(__half2 a, __half2 b) { + __half2 c; + c.q = __hip_hc_ir_hmul2_int(a.q, b.q); + return c; +} + +__device__ static inline __half2 __hmul2_sat(__half2 a, __half2 b) { + __half2 c; + c.q = __hip_hc_ir_hmul2_int(a.q, b.q); + return c; +} + +__device__ static inline __half2 __hsub2(__half2 a, __half2 b) { + __half2 c; + c.q = __hip_hc_ir_hsub2_int(a.q, b.q); + return c; +} + +__device__ static inline __half2 __hneg2(__half2 a) { + __half2 c; + c.p[0] = - a.p[0]; + c.p[1] = - a.p[1]; + return c; +} + +__device__ static inline __half2 __hsub2_sat(__half2 a, __half2 b) { + __half2 c; + c.q = __hip_hc_ir_hsub2_int(a.q, b.q); + return c; +} + +__device__ static inline __half2 h2div(__half2 a, __half2 b) { + __half2 c; + c.p[0] = a.p[0] / b.p[0]; + c.p[1] = a.p[1] / b.p[1]; + return c; +} + #endif #if __clang_major__ == 3 diff --git a/src/hip_ir.ll b/src/hip_ir.ll index 202bf9f215..52460a38bb 100644 --- a/src/hip_ir.ll +++ b/src/hip_ir.ll @@ -61,6 +61,29 @@ define half @__hip_hc_ir_hfma_half(half %a, half %b, half %c) #1 { ret half %9 } +define i32 @__hip_hc_ir_hadd2_int(i32 %a, i32 %b) #1 { + %1 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) + tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) + ret i32 %1 +} +define i32 @__hip_hc_ir_hfma2_int(i32 %a, i32 %b, i32 %c) #1 { + %1 = tail call i32 asm sideeffect "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c) + tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) + tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %1, i32 %c) + ret i32 %1 +} + +define i32 @__hip_hc_ir_hmul2_int(i32 %a, i32 %b) #1 { + %1 = tail call i32 asm sideeffect "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) + tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) + ret i32 %1 +} + +define i32 @__hip_hc_ir_hsub2_int(i32 %a, i32 %b) #1 { + %1 = tail call i32 asm sideeffect "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) + tail call void asm sideeffect "v_sub_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) + ret i32 %1 +} attributes #1 = { alwaysinline nounwind } diff --git a/tests/src/deviceLib/hipTestHalf.cpp b/tests/src/deviceLib/hipTestHalf.cpp index 9533bf34ca..2c01c5cb72 100644 --- a/tests/src/deviceLib/hipTestHalf.cpp +++ b/tests/src/deviceLib/hipTestHalf.cpp @@ -69,6 +69,18 @@ __global__ void CheckHalf(hipLaunchParm lp, __half* In1, __half* In2, __half* In Out[9] = hdiv(In1[9], In2[9]); } +__global__ void CheckHalf2(hipLaunchParm lp, __half2* In1, __half2* In2, __half2* In3, __half2* Out){ + Out[0] = __hadd2(In1[0], In2[0]); + Out[1] = __hadd2_sat(In1[1], In2[1]); + Out[2] = __hfma2(In1[2], In2[2],In3[2]); + Out[3] = __hfma2_sat(In1[3], In2[3], In3[3]); + Out[4] = __hmul2(In1[4], In2[4]); + Out[5] = __hmul2_sat(In1[5], In2[5]); + Out[6] = __hneg2(In1[6]); + Out[7] = __hsub2(In1[7], In2[7]); + Out[8] = __hsub2_sat(In1[8], In2[8]); + Out[9] = h2div(In1[9], In2[9]); +} int main(){