From 467d8d1aea2f49ea020addc3eb082a536407702f Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 22 Nov 2016 15:26:00 -0600 Subject: [PATCH] added fast math intrinsics to HIP 1. Added fast math intrinsics for single precision data types 2. Added test to check the intrinsics 3. Added HIP_PRECISE_MATH macro to enable precise math on fast math Change-Id: Iadacbb6182c31252c5e3252854372d1b80dfd27b [ROCm/hip commit: f843928ddd2797ba93715a13b3601e978e18651d] --- .../hip/include/hip/hcc_detail/hip_runtime.h | 205 ++++++++++++++++-- projects/hip/src/device_util.cpp | 163 ++++++++++++-- .../hip/tests/src/deviceLib/hipFloatMath.cpp | 61 ++++++ 3 files changed, 391 insertions(+), 38 deletions(-) create mode 100644 projects/hip/tests/src/deviceLib/hipFloatMath.cpp diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime.h b/projects/hip/include/hip/hcc_detail/hip_runtime.h index 079d681c39..4b781b44ae 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime.h @@ -479,26 +479,193 @@ __device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr(); //TODO - add a couple fast math operations here, the set here will grow : -__device__ float __expf(float x); -__device__ float __frsqrt_rn(float x); -__device__ float __fsqrt_rd(float x); -__device__ float __fsqrt_rn(float x); -__device__ float __fsqrt_ru(float x); -__device__ float __fsqrt_rz(float x); -__device__ float __log10f(float x); -//__device__ float __log2f(float x); -__device__ float __logf(float x); -__device__ float __powf(float base, float exponent); -__device__ void __sincosf(float x, float *s, float *c) ; -extern __attribute__((const)) float __sinf(float) __asm("llvm.sin.f32"); -extern __attribute__((const)) float __cosf(float) __asm("llvm.cos.f32"); -extern __attribute__((const)) float __log2f(float) __asm("llvm.log2.f32"); -__device__ float __tanf(float x); -__device__ float __dsqrt_rd(double x); -__device__ float __dsqrt_rn(double x); -__device__ float __dsqrt_ru(double x); -__device__ float __dsqrt_rz(double x); +// Single Precision Precise Math +__device__ float __hip_precise_cosf(float); +__device__ float __hip_precise_exp10f(float); +__device__ float __hip_precise_expf(float); +__device__ float __hip_precise_frsqrt_rn(float); +__device__ float __hip_precise_fsqrt_rd(float); +__device__ float __hip_precise_fsqrt_rn(float); +__device__ float __hip_precise_fsqrt_ru(float); +__device__ float __hip_precise_fsqrt_rz(float); +__device__ float __hip_precise_log10f(float); +__device__ float __hip_precise_log2f(float); +__device__ float __hip_precise_logf(float); +__device__ float __hip_precise_powf(float, float); +__device__ void __hip_precise_sincosf(float,float*,float*); +__device__ float __hip_precise_sinf(float); +__device__ float __hip_precise_tanf(float); +// Double Precision Precise Math +__device__ double __hip_precise_dsqrt_rd(double); +__device__ double __hip_precise_dsqrt_rn(double); +__device__ double __hip_precise_dsqrt_ru(double); +__device__ double __hip_precise_dsqrt_rz(double); + +// Single Precision Fast Math +extern __attribute__((const)) float __hip_fast_cosf(float) __asm("llvm.cos.f32"); +extern __attribute__((const)) float __hip_fast_exp2f(float) __asm("llvm.exp2.f32"); +__device__ float __hip_fast_exp10f(float); +__device__ float __hip_fast_expf(float); +__device__ float __hip_fast_frsqrt_rn(float); +extern __attribute__((const)) float __hip_fast_fsqrt_rd(float) __asm("llvm.sqrt.f32"); +__device__ float __hip_fast_fsqrt_rn(float); +__device__ float __hip_fast_fsqrt_ru(float); +__device__ float __hip_fast_fsqrt_rz(float); +__device__ float __hip_fast_log10f(float); +extern __attribute__((const)) float __hip_fast_log2f(float) __asm("llvm.log2.f32"); +__device__ float __hip_fast_logf(float); +__device__ float __hip_fast_powf(float, float); +__device__ void __hip_fast_sincosf(float,float*,float*); +extern __attribute__((const)) float __hip_fast_sinf(float) __asm("llvm.sin.f32"); +__device__ float __hip_fast_tanf(float); + +#ifdef HIP_PRECISE_MATH +// Single Precision Precise Math when enabled + +__device__ inline float __cosf(float x) { + return __hip_precise_cosf(x); +} + +__device__ inline float __exp10f(float x) { + return __hip_precise_exp10f(x); +} + +__device__ inline float __expf(float x) { + return __hip_precise_expf(x); +} + +__device__ inline float __frsqrt_rn(float x) { + return __hip_precise_frsqrt_rn(x); +} + +__device__ inline float __fsqrt_rd(float x) { + return __hip_precise_fsqrt_rd(x); +} + +__device__ inline float __fsqrt_rn(float x) { + return __hip_precise_fsqrt_rn(x); +} + +__device__ inline float __fsqrt_ru(float x) { + return __hip_precise_fsqrt_ru(x); +} + +__device__ inline float __fsqrt_rz(float x) { + return __hip_precise_fsqrt_rz(x); +} + +__device__ inline float __log10f(float x) { + return __hip_precise_log10f(x); +} + +__device__ inline float __log2f(float x) { + return __hip_precise_log2f(x); +} + +__device__ inline float __logf(float x) { + return __hip_precise_logf(x); +} + +__device__ inline float __powf(float base, float exponent) { + return __hip_precise_powf(base, exponent); +} + +__device__ inline void __sincosf(float x, float *s, float *c) { + return __hip_precise_sincosf(x, s, c); +} + +__device__ inline float __sinf(float x) { + return __hip_precise_sinf(x); +} + +__device__ inline float __tanf(float x) { + return __hip_precise_tanf(x); +} + +// Double Precision + +__device__ double __dsqrt_rd(double x) { + return __hip_precise_dsqrt_rd(x); +} + +__device__ double __dsqrt_rn(double x) { + return __hip_precise_dsqrt_rn(x); +} + +__device__ double __dsqrt_ru(double x) { + return __hip_precise_dsqrt_ru(x); +} + +__device__ double __dsqrt_rz(double x) { + return __hip_precise_dsqrt_rz(x); +} + +#else + +// Single Precision Fast Math +__device__ inline float __cosf(float x) { + return __hip_fast_cosf(x); +} + +__device__ inline float __exp10f(float x) { + return __hip_fast_exp10f(x); +} + +__device__ inline float __expf(float x) { + return __hip_fast_expf(x); +} + +__device__ inline float __frsqrt_rn(float x) { + return __hip_fast_frsqrt_rn(x); +} + +__device__ inline float __fsqrt_rd(float x) { + return __hip_fast_fsqrt_rd(x); +} + +__device__ inline float __fsqrt_rn(float x) { + return __hip_fast_fsqrt_rn(x); +} + +__device__ inline float __fsqrt_ru(float x) { + return __hip_fast_fsqrt_ru(x); +} + +__device__ inline float __fsqrt_rz(float x) { + return __hip_fast_fsqrt_rz(x); +} + +__device__ inline float __log10f(float x) { + return __hip_fast_log10f(x); +} + +__device__ inline float __log2f(float x) { + return __hip_fast_log2f(x); +} + +__device__ inline float __logf(float x) { + return __hip_fast_logf(x); +} + +__device__ inline float __powf(float base, float exponent) { + return __hip_fast_powf(base, exponent); +} + +__device__ inline void __sincosf(float x, float *s, float *c) { + return __hip_fast_sincosf(x, s, c); +} + +__device__ inline float __sinf(float x) { + return __hip_fast_sinf(x); +} + +__device__ inline float __tanf(float x) { + return __hip_fast_tanf(x); +} + + +#endif /** * CUDA 8 device function features diff --git a/projects/hip/src/device_util.cpp b/projects/hip/src/device_util.cpp index fc966f3395..203e7a7826 100644 --- a/projects/hip/src/device_util.cpp +++ b/projects/hip/src/device_util.cpp @@ -2043,26 +2043,151 @@ __device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr() } +// Precise Math Functions +__device__ float __hip_precise_cosf(float x) { + return hc::precise_math::cosf(x); +} -//TODO - add a couple fast math operations here, the set here will grow : -//__device__ float __cosf(float x) {return hc::fast_math::cosf(x); }; -__device__ float __expf(float x) {return hc::fast_math::expf(x); }; -__device__ float __frsqrt_rn(float x) {return hc::fast_math::rsqrt(x); }; -__device__ float __fsqrt_rd(float x) {return hc::fast_math::sqrt(x); }; -__device__ float __fsqrt_rn(float x) {return hc::fast_math::sqrt(x); }; -__device__ float __fsqrt_ru(float x) {return hc::fast_math::sqrt(x); }; -__device__ float __fsqrt_rz(float x) {return hc::fast_math::sqrt(x); }; -__device__ float __log10f(float x) {return hc::fast_math::log10f(x); }; -//__device__ float __log2f(float x) {return hc::fast_math::log2f(x); }; -__device__ float __logf(float x) {return hc::fast_math::logf(x); }; -__device__ float __powf(float base, float exponent) {return hc::fast_math::powf(base, exponent); }; -__device__ void __sincosf(float x, float *s, float *c) { *s = __sinf(x); *c = __cosf(x); }; -//__device__ float __sinf(float x) {return hc::fast_math::sinf(x); }; -__device__ float __tanf(float x) {return __sinf(x)/__cosf(x); }; -__device__ float __dsqrt_rd(double x) {return hc::fast_math::sqrt(x); }; -__device__ float __dsqrt_rn(double x) {return hc::fast_math::sqrt(x); }; -__device__ float __dsqrt_ru(double x) {return hc::fast_math::sqrt(x); }; -__device__ float __dsqrt_rz(double x) {return hc::fast_math::sqrt(x); }; +__device__ float __hip_precise_exp10f(float x) { + return hc::precise_math::exp10f(x); +} + +__device__ float __hip_precise_expf(float x) { + return hc::precise_math::expf(x); +} + +__device__ float __hip_precise_frsqrt_rn(float x) { + return hc::precise_math::rsqrt(x); +} + +__device__ float __hip_precise_fsqrt_rd(float x) { + return hc::precise_math::sqrt(x); +} + +__device__ float __hip_precise_fsqrt_rn(float x) { + return hc::precise_math::sqrt(x); +} + +__device__ float __hip_precise_fsqrt_ru(float x) { + return hc::precise_math::sqrt(x); +} + +__device__ float __hip_precise_fsqrt_rz(float x) { + return hc::precise_math::sqrt(x); +} + +__device__ float __hip_precise_log10f(float x) { + return hc::precise_math::log10(x); +} + +__device__ float __hip_precise_log2f(float x) { + return hc::precise_math::log2(x); +} + +__device__ float __hip_precise_logf(float x) { + return hc::precise_math::logf(x); +} + +__device__ float __hip_precise_powf(float base, float exponent) { + return hc::precise_math::powf(base, exponent); +} + +__device__ void __hip_precise_sincosf(float x, float *s, float *c) { + hc::precise_math::sincosf(x, s, c); +} + +__device__ float __hip_precise_sinf(float x) { + return hc::precise_math::sinf(x); +} + +__device__ float __hip_precise_tanf(float x) { + return hc::precise_math::tanf(x); +} + +// Double Precision Math +__device__ double __hip_precise_dsqrt_rd(double x) { + return hc::precise_math::sqrt(x); +} + +__device__ double __hip_precise_dsqrt_rn(double x) { + return hc::precise_math::sqrt(x); +} + +__device__ double __hip_precise_dsqrt_ru(double x) { + return hc::precise_math::sqrt(x); +} + +__device__ double __hip_precise_dsqrt_rz(double x) { + return hc::precise_math::sqrt(x); +} + +#define LOG_BASE2_E_DIV_2 0.4426950408894701 +#define LOG_BASE2_5 2.321928094887362 +#define ONE_DIV_LOG_BASE2_E 0.69314718056 +#define ONE_DIV_LOG_BASE2_10 0.30102999566 + +// Fast Math Intrinsics +__device__ float __hip_fast_exp10f(float x) { + return __hip_fast_exp2f(x*LOG_BASE2_E_DIV_2); +} + +__device__ float __hip_fast_expf(float x) { + return __hip_fast_expf(x*LOG_BASE2_5); +} + +__device__ float __hip_fast_frsqrt_rn(float x) { + return 1 / __hip_fast_fsqrt_rd(x);; +} + +__device__ float __hip_fast_fsqrt_rn(float x) { + return __hip_fast_fsqrt_rd(x); +} + +__device__ float __hip_fast_fsqrt_ru(float x) { + return __hip_fast_fsqrt_rd(x); +} + +__device__ float __hip_fast_fsqrt_rz(float x) { + return __hip_fast_fsqrt_rd(x); +} + +__device__ float __hip_fast_log10f(float x) { + return ONE_DIV_LOG_BASE2_E * __hip_fast_log2f(x); +} + +__device__ float __hip_fast_logf(float x) { + return ONE_DIV_LOG_BASE2_10 * __hip_fast_log2f(x); +} + +__device__ float __hip_fast_powf(float base, float exponent) { + return hc::fast_math::powf(base, exponent); +} + +__device__ void __hip_fast_sincosf(float x, float *s, float *c) { + *s = __hip_fast_sinf(x); + *c = __hip_fast_cosf(x); +} + +__device__ float __hip_fast_tanf(float x) { + return hc::fast_math::tanf(x); +} + +// Double Precision Math +__device__ double __hip_fast_dsqrt_rd(double x) { + return hc::fast_math::sqrt(x); +} + +__device__ double __hip_fast_dsqrt_rn(double x) { + return hc::fast_math::sqrt(x); +} + +__device__ double __hip_fast_dsqrt_ru(double x) { + return hc::fast_math::sqrt(x); +} + +__device__ double __hip_fast_dsqrt_rz(double x) { + return hc::fast_math::sqrt(x); +} __HIP_DEVICE__ char1 make_char1(signed char x) { diff --git a/projects/hip/tests/src/deviceLib/hipFloatMath.cpp b/projects/hip/tests/src/deviceLib/hipFloatMath.cpp new file mode 100644 index 0000000000..eb70eb6b0b --- /dev/null +++ b/projects/hip/tests/src/deviceLib/hipFloatMath.cpp @@ -0,0 +1,61 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s + * RUN: %t + * HIT_END + */ + +#include "test_common.h" + +#define LEN 512 +#define SIZE LEN<<2 + + + +__global__ void floatMath(hipLaunchParm lp, float *In, float *Out) { + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + Out[tid] = __cosf(In[tid]); + Out[tid] = __exp10f(Out[tid]); + Out[tid] = __expf(Out[tid]); + Out[tid] = __frsqrt_rn(Out[tid]); + Out[tid] = __fsqrt_rd(Out[tid]); + Out[tid] = __fsqrt_rn(Out[tid]); + Out[tid] = __fsqrt_ru(Out[tid]); + Out[tid] = __fsqrt_rz(Out[tid]); + Out[tid] = __log10f(Out[tid]); + Out[tid] = __log2f(Out[tid]); + Out[tid] = __logf(Out[tid]); + Out[tid] = __powf(2.0f, Out[tid]); + __sincosf(Out[tid], &In[tid], &Out[tid]); + Out[tid] = __sinf(Out[tid]); + Out[tid] = __cosf(Out[tid]); + Out[tid] = __tanf(Out[tid]); +} + +int main(){ + float *Inh, *Outh, *Ind, *Outd; + hipMalloc((void**)&Ind, SIZE); + hipMalloc((void**)&Outd, SIZE); + hipLaunchKernel(floatMath, dim3(LEN,1,1), dim3(1,1,1), 0, 0, Ind, Outd); +}