From 467d8d1aea2f49ea020addc3eb082a536407702f Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Tue, 22 Nov 2016 15:26:00 -0600
Subject: [PATCH] added fast math intrinsics to HIP 1. Added fast math
 intrinsics for single precision data types 2. Added test to check the
 intrinsics 3. Added HIP_PRECISE_MATH macro to enable precise math on fast
 math

Change-Id: Iadacbb6182c31252c5e3252854372d1b80dfd27b


[ROCm/hip commit: f843928ddd2797ba93715a13b3601e978e18651d]
---
 .../hip/include/hip/hcc_detail/hip_runtime.h  | 205 ++++++++++++++++--
 projects/hip/src/device_util.cpp              | 163 ++++++++++++--
 .../hip/tests/src/deviceLib/hipFloatMath.cpp  |  61 ++++++
 3 files changed, 391 insertions(+), 38 deletions(-)
 create mode 100644 projects/hip/tests/src/deviceLib/hipFloatMath.cpp

diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime.h b/projects/hip/include/hip/hcc_detail/hip_runtime.h
index 079d681c39..4b781b44ae 100644
--- a/projects/hip/include/hip/hcc_detail/hip_runtime.h
+++ b/projects/hip/include/hip/hcc_detail/hip_runtime.h
@@ -479,26 +479,193 @@ __device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr();
 
 //TODO - add a couple fast math operations here, the set here will grow :
 
-__device__ float __expf(float x);
-__device__ float __frsqrt_rn(float x);
-__device__ float __fsqrt_rd(float x);
-__device__ float __fsqrt_rn(float x);
-__device__ float __fsqrt_ru(float x);
-__device__ float __fsqrt_rz(float x);
-__device__ float __log10f(float x);
-//__device__ float __log2f(float x);
-__device__ float __logf(float x);
-__device__ float __powf(float base, float exponent);
-__device__ void __sincosf(float x, float *s, float *c) ;
-extern __attribute__((const)) float __sinf(float) __asm("llvm.sin.f32");
-extern __attribute__((const)) float __cosf(float) __asm("llvm.cos.f32");
-extern __attribute__((const)) float __log2f(float) __asm("llvm.log2.f32");
-__device__ float __tanf(float x);
-__device__ float __dsqrt_rd(double x);
-__device__ float __dsqrt_rn(double x);
-__device__ float __dsqrt_ru(double x);
-__device__ float __dsqrt_rz(double x);
+// Single Precision Precise Math
+__device__ float __hip_precise_cosf(float);
+__device__ float __hip_precise_exp10f(float);
+__device__ float __hip_precise_expf(float);
+__device__ float __hip_precise_frsqrt_rn(float);
+__device__ float __hip_precise_fsqrt_rd(float);
+__device__ float __hip_precise_fsqrt_rn(float);
+__device__ float __hip_precise_fsqrt_ru(float);
+__device__ float __hip_precise_fsqrt_rz(float);
+__device__ float __hip_precise_log10f(float);
+__device__ float __hip_precise_log2f(float);
+__device__ float __hip_precise_logf(float);
+__device__ float __hip_precise_powf(float, float);
+__device__ void __hip_precise_sincosf(float,float*,float*);
+__device__ float __hip_precise_sinf(float);
+__device__ float __hip_precise_tanf(float);
 
+// Double Precision Precise Math
+__device__ double __hip_precise_dsqrt_rd(double);
+__device__ double __hip_precise_dsqrt_rn(double);
+__device__ double __hip_precise_dsqrt_ru(double);
+__device__ double __hip_precise_dsqrt_rz(double);
+
+// Single Precision Fast Math
+extern __attribute__((const)) float __hip_fast_cosf(float) __asm("llvm.cos.f32");
+extern __attribute__((const)) float __hip_fast_exp2f(float) __asm("llvm.exp2.f32");
+__device__ float __hip_fast_exp10f(float);
+__device__ float __hip_fast_expf(float);
+__device__ float __hip_fast_frsqrt_rn(float);
+extern __attribute__((const)) float __hip_fast_fsqrt_rd(float) __asm("llvm.sqrt.f32");
+__device__ float __hip_fast_fsqrt_rn(float);
+__device__ float __hip_fast_fsqrt_ru(float);
+__device__ float __hip_fast_fsqrt_rz(float);
+__device__ float __hip_fast_log10f(float);
+extern __attribute__((const)) float __hip_fast_log2f(float) __asm("llvm.log2.f32");
+__device__ float __hip_fast_logf(float);
+__device__ float __hip_fast_powf(float, float);
+__device__ void __hip_fast_sincosf(float,float*,float*);
+extern __attribute__((const)) float __hip_fast_sinf(float) __asm("llvm.sin.f32");
+__device__ float __hip_fast_tanf(float);
+
+#ifdef HIP_PRECISE_MATH
+// Single Precision Precise Math when enabled
+
+__device__ inline float __cosf(float x) {
+  return __hip_precise_cosf(x);
+}
+
+__device__ inline float __exp10f(float x) {
+  return __hip_precise_exp10f(x);
+}
+
+__device__ inline float __expf(float x) {
+  return __hip_precise_expf(x);
+}
+
+__device__ inline float __frsqrt_rn(float x) {
+  return __hip_precise_frsqrt_rn(x);
+}
+
+__device__ inline float __fsqrt_rd(float x) {
+  return __hip_precise_fsqrt_rd(x);
+}
+
+__device__ inline float __fsqrt_rn(float x) {
+  return __hip_precise_fsqrt_rn(x);
+}
+
+__device__ inline float __fsqrt_ru(float x) {
+  return __hip_precise_fsqrt_ru(x);
+}
+
+__device__ inline float __fsqrt_rz(float x) {
+  return __hip_precise_fsqrt_rz(x);
+}
+
+__device__ inline float __log10f(float x) {
+  return __hip_precise_log10f(x);
+}
+
+__device__ inline float __log2f(float x) {
+  return __hip_precise_log2f(x);
+}
+
+__device__ inline float __logf(float x) {
+  return __hip_precise_logf(x);
+}
+
+__device__ inline float __powf(float base, float exponent) {
+  return __hip_precise_powf(base, exponent);
+}
+
+__device__ inline void __sincosf(float x, float *s, float *c) {
+  return __hip_precise_sincosf(x, s, c);
+}
+
+__device__ inline float __sinf(float x) {
+  return __hip_precise_sinf(x);
+}
+
+__device__ inline float __tanf(float x) {
+  return __hip_precise_tanf(x);
+}
+
+// Double Precision
+
+__device__ double __dsqrt_rd(double x) {
+  return __hip_precise_dsqrt_rd(x);
+}
+
+__device__ double __dsqrt_rn(double x) {
+  return __hip_precise_dsqrt_rn(x);
+}
+
+__device__ double __dsqrt_ru(double x) {
+  return __hip_precise_dsqrt_ru(x);
+}
+
+__device__ double __dsqrt_rz(double x) {
+  return __hip_precise_dsqrt_rz(x);
+}
+
+#else
+
+// Single Precision Fast Math
+__device__ inline float __cosf(float x) {
+  return __hip_fast_cosf(x);
+}
+
+__device__ inline float __exp10f(float x) {
+  return __hip_fast_exp10f(x);
+}
+
+__device__ inline float __expf(float x) {
+  return __hip_fast_expf(x);
+}
+
+__device__ inline float __frsqrt_rn(float x) {
+  return __hip_fast_frsqrt_rn(x);
+}
+
+__device__ inline float __fsqrt_rd(float x) {
+  return __hip_fast_fsqrt_rd(x);
+}
+
+__device__ inline float __fsqrt_rn(float x) {
+  return __hip_fast_fsqrt_rn(x);
+}
+
+__device__ inline float __fsqrt_ru(float x) {
+  return __hip_fast_fsqrt_ru(x);
+}
+
+__device__ inline float __fsqrt_rz(float x) {
+  return __hip_fast_fsqrt_rz(x);
+}
+
+__device__ inline float __log10f(float x) {
+  return __hip_fast_log10f(x);
+}
+
+__device__ inline float __log2f(float x) {
+  return __hip_fast_log2f(x);
+}
+
+__device__ inline float __logf(float x) {
+  return __hip_fast_logf(x);
+}
+
+__device__ inline float __powf(float base, float exponent) {
+  return __hip_fast_powf(base, exponent);
+}
+
+__device__ inline void __sincosf(float x, float *s, float *c) {
+  return __hip_fast_sincosf(x, s, c);
+}
+
+__device__ inline float __sinf(float x) {
+  return __hip_fast_sinf(x);
+}
+
+__device__ inline float __tanf(float x) {
+  return __hip_fast_tanf(x);
+}
+
+
+#endif
 /**
  * CUDA 8 device function features
 
diff --git a/projects/hip/src/device_util.cpp b/projects/hip/src/device_util.cpp
index fc966f3395..203e7a7826 100644
--- a/projects/hip/src/device_util.cpp
+++ b/projects/hip/src/device_util.cpp
@@ -2043,26 +2043,151 @@ __device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr()
 }
 
 
+// Precise Math Functions
+__device__ float __hip_precise_cosf(float x) {
+  return hc::precise_math::cosf(x);
+}
 
-//TODO - add a couple fast math operations here, the set here will grow :
-//__device__  float __cosf(float x) {return hc::fast_math::cosf(x); };
-__device__  float __expf(float x) {return hc::fast_math::expf(x); };
-__device__  float __frsqrt_rn(float x) {return hc::fast_math::rsqrt(x); };
-__device__  float __fsqrt_rd(float x) {return hc::fast_math::sqrt(x); };
-__device__  float __fsqrt_rn(float x) {return hc::fast_math::sqrt(x); };
-__device__  float __fsqrt_ru(float x) {return hc::fast_math::sqrt(x); };
-__device__  float __fsqrt_rz(float x) {return hc::fast_math::sqrt(x); };
-__device__  float __log10f(float x) {return hc::fast_math::log10f(x); };
-//__device__  float __log2f(float x) {return hc::fast_math::log2f(x); };
-__device__  float __logf(float x) {return hc::fast_math::logf(x); };
-__device__  float __powf(float base, float exponent) {return hc::fast_math::powf(base, exponent); };
-__device__  void __sincosf(float x, float *s, float *c) { *s = __sinf(x); *c = __cosf(x); };
-//__device__  float __sinf(float x) {return hc::fast_math::sinf(x); };
-__device__  float __tanf(float x) {return __sinf(x)/__cosf(x); };
-__device__  float __dsqrt_rd(double x) {return hc::fast_math::sqrt(x); };
-__device__  float __dsqrt_rn(double x) {return hc::fast_math::sqrt(x); };
-__device__  float __dsqrt_ru(double x) {return hc::fast_math::sqrt(x); };
-__device__  float __dsqrt_rz(double x) {return hc::fast_math::sqrt(x); };
+__device__ float __hip_precise_exp10f(float x) {
+  return hc::precise_math::exp10f(x);
+}
+
+__device__ float __hip_precise_expf(float x) {
+  return hc::precise_math::expf(x);
+}
+
+__device__ float __hip_precise_frsqrt_rn(float x) {
+  return hc::precise_math::rsqrt(x);
+}
+
+__device__ float __hip_precise_fsqrt_rd(float x) {
+  return hc::precise_math::sqrt(x);
+}
+
+__device__ float __hip_precise_fsqrt_rn(float x) {
+  return hc::precise_math::sqrt(x);
+}
+
+__device__ float __hip_precise_fsqrt_ru(float x) {
+  return hc::precise_math::sqrt(x);
+}
+
+__device__ float __hip_precise_fsqrt_rz(float x) {
+  return hc::precise_math::sqrt(x);
+}
+
+__device__ float __hip_precise_log10f(float x) {
+  return hc::precise_math::log10(x);
+}
+
+__device__ float __hip_precise_log2f(float x) {
+  return hc::precise_math::log2(x);
+}
+
+__device__ float __hip_precise_logf(float x) {
+  return hc::precise_math::logf(x);
+}
+
+__device__ float __hip_precise_powf(float base, float exponent) {
+  return hc::precise_math::powf(base, exponent);
+}
+
+__device__ void __hip_precise_sincosf(float x, float *s, float *c) {
+  hc::precise_math::sincosf(x, s, c);
+}
+
+__device__ float __hip_precise_sinf(float x) {
+  return hc::precise_math::sinf(x);
+}
+
+__device__ float __hip_precise_tanf(float x) {
+  return hc::precise_math::tanf(x);
+}
+
+// Double Precision Math
+__device__ double __hip_precise_dsqrt_rd(double x) {
+  return hc::precise_math::sqrt(x);
+}
+
+__device__ double __hip_precise_dsqrt_rn(double x) {
+  return hc::precise_math::sqrt(x);
+}
+
+__device__ double __hip_precise_dsqrt_ru(double x) {
+  return hc::precise_math::sqrt(x);
+}
+
+__device__ double __hip_precise_dsqrt_rz(double x) {
+  return hc::precise_math::sqrt(x);
+}
+
+#define LOG_BASE2_E_DIV_2 0.4426950408894701
+#define LOG_BASE2_5 2.321928094887362
+#define ONE_DIV_LOG_BASE2_E 0.69314718056
+#define ONE_DIV_LOG_BASE2_10 0.30102999566
+
+// Fast Math Intrinsics
+__device__ float __hip_fast_exp10f(float x) {
+  return __hip_fast_exp2f(x*LOG_BASE2_E_DIV_2);
+}
+
+__device__ float __hip_fast_expf(float x) {
+  return __hip_fast_expf(x*LOG_BASE2_5);
+}
+
+__device__ float __hip_fast_frsqrt_rn(float x) {
+  return 1 / __hip_fast_fsqrt_rd(x);;
+}
+
+__device__ float __hip_fast_fsqrt_rn(float x) {
+  return __hip_fast_fsqrt_rd(x);
+}
+
+__device__ float __hip_fast_fsqrt_ru(float x) {
+  return __hip_fast_fsqrt_rd(x);
+}
+
+__device__ float __hip_fast_fsqrt_rz(float x) {
+  return __hip_fast_fsqrt_rd(x);
+}
+
+__device__ float __hip_fast_log10f(float x) {
+  return ONE_DIV_LOG_BASE2_E * __hip_fast_log2f(x);
+}
+
+__device__ float __hip_fast_logf(float x) {
+  return ONE_DIV_LOG_BASE2_10 * __hip_fast_log2f(x);
+}
+
+__device__ float __hip_fast_powf(float base, float exponent) {
+  return hc::fast_math::powf(base, exponent);
+}
+
+__device__ void __hip_fast_sincosf(float x, float *s, float *c) {
+  *s = __hip_fast_sinf(x);
+  *c = __hip_fast_cosf(x);
+}
+
+__device__ float __hip_fast_tanf(float x) {
+  return hc::fast_math::tanf(x);
+}
+
+// Double Precision Math
+__device__ double __hip_fast_dsqrt_rd(double x) {
+  return hc::fast_math::sqrt(x);
+}
+
+__device__ double __hip_fast_dsqrt_rn(double x) {
+  return hc::fast_math::sqrt(x);
+}
+
+__device__ double __hip_fast_dsqrt_ru(double x) {
+  return hc::fast_math::sqrt(x);
+}
+
+__device__ double __hip_fast_dsqrt_rz(double x) {
+  return hc::fast_math::sqrt(x);
+}
 
 __HIP_DEVICE__ char1 make_char1(signed char x)
 {
diff --git a/projects/hip/tests/src/deviceLib/hipFloatMath.cpp b/projects/hip/tests/src/deviceLib/hipFloatMath.cpp
new file mode 100644
index 0000000000..eb70eb6b0b
--- /dev/null
+++ b/projects/hip/tests/src/deviceLib/hipFloatMath.cpp
@@ -0,0 +1,61 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s
+ * RUN: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+
+#define LEN 512
+#define SIZE LEN<<2
+
+
+
+__global__ void floatMath(hipLaunchParm lp, float *In, float *Out) {
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+  Out[tid] = __cosf(In[tid]);
+  Out[tid] = __exp10f(Out[tid]);
+  Out[tid] = __expf(Out[tid]);
+  Out[tid] = __frsqrt_rn(Out[tid]);
+  Out[tid] = __fsqrt_rd(Out[tid]);
+  Out[tid] = __fsqrt_rn(Out[tid]);
+  Out[tid] = __fsqrt_ru(Out[tid]);
+  Out[tid] = __fsqrt_rz(Out[tid]);
+  Out[tid] = __log10f(Out[tid]);
+  Out[tid] = __log2f(Out[tid]);
+  Out[tid] = __logf(Out[tid]);
+  Out[tid] = __powf(2.0f, Out[tid]);
+  __sincosf(Out[tid], &In[tid], &Out[tid]);
+  Out[tid] = __sinf(Out[tid]);
+  Out[tid] = __cosf(Out[tid]);
+  Out[tid] = __tanf(Out[tid]);
+}
+
+int main(){
+  float *Inh, *Outh, *Ind, *Outd;
+  hipMalloc((void**)&Ind, SIZE);
+  hipMalloc((void**)&Outd, SIZE);
+  hipLaunchKernel(floatMath, dim3(LEN,1,1), dim3(1,1,1), 0, 0, Ind, Outd);
+}