diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h index a34b75d1a6..db6c161c42 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h @@ -781,6 +781,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const return (__bf16)a + (__bf16)b; } +/** + * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH + * \brief Adds two bfloat16 values, will not fuse into fma + */ +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd_rn(const __hip_bfloat16 a, + const __hip_bfloat16 b) { +#pragma clang fp contract(off) + return (__bf16)a + (__bf16)b; +} + /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Subtracts two bfloat16 values @@ -789,6 +799,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const return (__bf16)a - (__bf16)b; } +/** + * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH + * \brief Subtracts two bfloat16 values, will not fuse into fma + */ +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub_rn(const __hip_bfloat16 a, + const __hip_bfloat16 b) { +#pragma clang fp contract(off) + return (__bf16)a - (__bf16)b; +} + /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Divides two bfloat16 values @@ -815,6 +835,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const return (__bf16)a * (__bf16)b; } +/** + * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH + * \brief Multiplies two bfloat16 values, will not fuse into fma + */ +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul_rn(const __hip_bfloat16 a, + const __hip_bfloat16 b) { +#pragma clang fp contract(off) + return (__bf16)a * (__bf16)b; +} + /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Negate a bfloat16 value @@ -861,6 +891,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a, return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)}; } +/** + * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH + * \brief Adds two bfloat162 values, will not fuse into fma + */ +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2_rn(const __hip_bfloat162 a, + const __hip_bfloat162 b) { +#pragma clang fp contract(off) + return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)}; +} + /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Performs FMA of given bfloat162 values @@ -879,6 +919,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a, return __hip_bfloat162{__bf16_2(a) * __bf16_2(b)}; } +/** + * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH + * \brief Multiplies two bfloat162 values, will not fuse into fma + */ +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2_rn(const __hip_bfloat162 a, + const __hip_bfloat162 b) { +#pragma clang fp contract(off) + return __hip_bfloat162{__bf16_2(a) * __bf16_2(b)}; +} + /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Converts a bfloat162 into negative @@ -896,6 +946,16 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a, return __hip_bfloat162{__bf16_2(a) - __bf16_2(b)}; } +/** + * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH + * \brief Subtracts two bfloat162 values, will not fuse into fma + */ +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2_rn(const __hip_bfloat162 a, + const __hip_bfloat162 b) { +#pragma clang fp contract(off) + return __hip_bfloat162{__bf16_2(a) - __bf16_2(b)}; +} + /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to multiply two __hip_bfloat16 numbers diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h index 814575ea1e..f53c22cb83 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h @@ -1367,6 +1367,15 @@ THE SOFTWARE. return __half_raw{ static_cast<__half_raw>(x).data + static_cast<__half_raw>(y).data}; + } + inline + __HOST_DEVICE__ + __half __hadd_rn(__half x, __half y) + { +#pragma clang fp contract(off) + return __half_raw{ + static_cast<__half_raw>(x).data + + static_cast<__half_raw>(y).data}; } inline __HOST_DEVICE__ @@ -1390,6 +1399,15 @@ THE SOFTWARE. } inline __HOST_DEVICE__ + __half __hsub_rn(__half x, __half y) + { +#pragma clang fp contract(off) + return __half_raw{ + static_cast<__half_raw>(x).data - + static_cast<__half_raw>(y).data}; + } + inline + __HOST_DEVICE__ __half __hmul(__half x, __half y) { return __half_raw{ @@ -1398,6 +1416,15 @@ THE SOFTWARE. } inline __HOST_DEVICE__ + __half __hmul_rn(__half x, __half y) + { +#pragma clang fp contract(off) + return __half_raw{ + static_cast<__half_raw>(x).data * + static_cast<__half_raw>(y).data}; + } + inline + __HOST_DEVICE__ __half __hadd_sat(__half x, __half y) { return __clamp_01(__hadd(x, y)); @@ -1446,6 +1473,16 @@ THE SOFTWARE. static_cast<__half2_raw>(x).data + static_cast<__half2_raw>(y).data}; } + inline + __HOST_DEVICE__ + __half2 __hadd2_rn(__half2 x, __half2 y) + { +#pragma clang fp contract(off) + return __half2{ + static_cast<__half2_raw>(x).data + + static_cast<__half2_raw>(y).data}; + } + inline __HOST_DEVICE__ __half2 __habs2(__half2 x) @@ -1462,6 +1499,15 @@ THE SOFTWARE. } inline __HOST_DEVICE__ + __half2 __hsub2_rn(__half2 x, __half2 y) + { +#pragma clang fp contract(off) + return __half2{ + static_cast<__half2_raw>(x).data - + static_cast<__half2_raw>(y).data}; + } + inline + __HOST_DEVICE__ __half2 __hmul2(__half2 x, __half2 y) { return __half2{ @@ -1470,6 +1516,15 @@ THE SOFTWARE. } inline __HOST_DEVICE__ + __half2 __hmul2_rn(__half2 x, __half2 y) + { +#pragma clang fp contract(off) + return __half2{ + static_cast<__half2_raw>(x).data * + static_cast<__half2_raw>(y).data}; + } + inline + __HOST_DEVICE__ __half2 __hadd2_sat(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(__hadd2(x, y));